— 1. Setup —

1.1 Load Libraries

# Define the list of packages
# Packages required for the analysis, grouped by purpose.
my_packages <- c(
  ## Data loading, cleaning, manipulation
  "tidyverse", "dplyr", "data.table", "arrow", "purrr", "visdat",

  ## Visualization
  "ggplot2", "ggrepel", "ggdist", "gghalves", "patchwork",
  "ComplexHeatmap", "circlize", "dendextend",

  ## Descriptive statistics and summaries
  "table1",

  ## Modeling and statistics
  "reshape2", "fixest", "tictoc", "lme4", "lmerTest",
  "broom.mixed", "effectsize", "modelsummary", "multcomp"
)

# Install any package not yet present in the local library
installed_packages <- rownames(installed.packages())
missing_packages <- setdiff(my_packages, installed_packages)
if (length(missing_packages) > 0) install.packages(missing_packages)

# Load the libraries and store the results (TRUE for success, FALSE for failure)
# Load each package and record TRUE/FALSE per package so failures are visible.
# vapply() (not sapply) guarantees a named logical vector regardless of input.
loaded <- vapply(my_packages, require, logical(1), character.only = TRUE)
cat("✅ Package loading summary:\n")
## ✅ Package loading summary:
# One row per package with its load status for a quick visual check
loaded_df <- data.frame(Package = names(loaded), Loaded = loaded)
print(loaded_df)
##                       Package Loaded
## tidyverse           tidyverse   TRUE
## dplyr                   dplyr   TRUE
## data.table         data.table   TRUE
## arrow                   arrow   TRUE
## purrr                   purrr   TRUE
## visdat                 visdat   TRUE
## ggplot2               ggplot2   TRUE
## ggrepel               ggrepel   TRUE
## ggdist                 ggdist   TRUE
## gghalves             gghalves   TRUE
## patchwork           patchwork   TRUE
## ComplexHeatmap ComplexHeatmap   TRUE
## circlize             circlize   TRUE
## dendextend         dendextend   TRUE
## table1                 table1   TRUE
## reshape2             reshape2   TRUE
## fixest                 fixest   TRUE
## tictoc                 tictoc   TRUE
## lme4                     lme4   TRUE
## lmerTest             lmerTest   TRUE
## broom.mixed       broom.mixed   TRUE
## effectsize         effectsize   TRUE
## modelsummary     modelsummary   TRUE
## multcomp             multcomp   TRUE

— 2. Data Preparation —

2.1 Loading and Merging Main Datasets

This section loads the core metrics datasets (post-level and topic-level) and the corresponding raw content data (post content and topic content/prompts). It then performs an initial merge to combine the metrics with the clean content for each level.

Two levels of dataset: The post-level dataset contains individual student posts, while the topic-level dataset aggregates each discussion topic’s overall content creation and social engagement indicators.

# Load core metrics datasets: the datasets with metrics measuring content creation and social engagement.
df_topic_metrics <- read.csv("data/df_topic_0904.csv")
df_post_metrics <- read.csv("data/df_post_0904.csv")
cat("✅ Loaded post-level and topic-level metrics data (CSV).\n")
## ✅ Loaded post-level and topic-level metrics data (CSV).
# Load content data (for both topics and posts): topic content is the discussion
# prompt, while post content is each individual post under a discussion topic.
## Topic content: keep only the discussion topics that appear in the topic-metrics table.
topic_content <- read_parquet("data/discussion_topic_assignment_19_24_cleaned_content_only.parquet.gzip") %>%
  filter(discussion_topic_id %in% df_topic_metrics$discussion_topic_id)

## Post content: keep only the posts that appear in the post-metrics table.
post_content <- read_parquet("data/discussion_post_assignment_19_24_cleaned_content_only.parquet.gzip") %>%
  filter(discussion_post_id %in% df_post_metrics$discussion_post_id)
cat("✅ Loaded and filtered topic and post content data (Parquet).\n")
## ✅ Loaded and filtered topic and post content data (Parquet).
# Merge content data with metrics data: connect discussion prompts and posts with metrics
## df_post: Post metrics joined with clean post content.
df_post <- left_join(df_post_metrics, post_content, by = "discussion_post_id")

## df_topic: Topic metrics joined with clean discussion topic prompt content.
df_topic <- left_join(df_topic_metrics, topic_content, by = "discussion_topic_id")
cat("✅ Merging complete. Created df_post and df_topic.\n")
## ✅ Merging complete. Created df_post and df_topic.

2.2 Initial Data Inspection

# Calculate percentage of missing values per column in df_topic
cat("--- Missing Value Percentage in df_topic: !!! Rows with missing 'discussion_topic_content_clean' will be removed later for modeling ---\n")
## --- Missing Value Percentage in df_topic: !!! Rows with missing 'discussion_topic_content_clean' will be removed later for modeling ---
# Text placeholders that should be treated as missing content.
# NOTE: this vector is reused later when filtering df_combined, so its name stays stable.
missing_content_placeholders <- c("", "NA", "N/A", "NULL", "No message")

# Flag rows whose prompt content is NA or one of the placeholder strings
content_is_missing <- is.na(df_topic$discussion_topic_content_clean) |
  df_topic$discussion_topic_content_clean %in% missing_content_placeholders
total_missing_count <- sum(content_is_missing)
total_missing_percentage <- (total_missing_count / nrow(df_topic)) * 100

cat("--- Enhanced Missing Value Percentage in df_topic ---\n")
## --- Enhanced Missing Value Percentage in df_topic ---
cat("Total rows (topics):", nrow(df_topic), "\n")
## Total rows (topics): 7667
cat(sprintf("Total missing/empty/placeholder content rows (Count): %d\n", total_missing_count))
## Total missing/empty/placeholder content rows (Count): 432
cat(sprintf("Total missing/empty/placeholder content rows (Percentage): %.2f%%\n", total_missing_percentage))
## Total missing/empty/placeholder content rows (Percentage): 5.63%
# Count the number of unique values (duplicate check) for each variable in df_topic
cat("--- Unique Value Count in df_topic ---\n")
## --- Unique Value Count in df_topic ---
df_topic_unique_counts <- df_topic %>%
  summarise(across(everything(), n_distinct))
print(df_topic_unique_counts)
##      X discussion_topic_id avg_similarity embedding_variance semantic_spread
## 1 7667                7667           7662               7663             353
##   MRS_depth thread_MRS_avg_depth post_count max_depth avg_depth max_width
## 1      6392                 1900        333         5      1247        36
##   avg_width total_users replied_users percent_replied avg_top_post_length
## 1      1245         214           118             958                5165
##   discussion_topic_title discussion_topic_content_clean
## 1                   4810                            909
cat("Note: With 7667 topic IDs, having 909 unique content entries confirms many topics reuse the same discussion prompt.\n")
## Note: With 7667 topic IDs, having 909 unique content entries confirms many topics reuse the same discussion prompt.
# ---Inspecting df_post: get a sense of variables in df_post that might be useful--
# Calculate percentage of missing values per column in df_post
cat("--- Missing Value Percentage in df_post ---\n")
## --- Missing Value Percentage in df_post ---
# colMeans(is.na(.)) is the per-column NA fraction; scale to percent
na_percentage_post <- 100 * colMeans(is.na(df_post))
# Show only columns that have at least one missing value
print(na_percentage_post[na_percentage_post > 0])
##         parent_discussion_post_id                     user_group_id 
##                        54.8608849                        49.8264595 
##                  first_generation                              race 
##                         7.2504665                         3.8248924 
##                            gender                         term_code 
##                         0.3381953                         3.4131880 
##                      course_title                       school_code 
##                         3.4131880                       100.0000000 
##                school_name_abbrev                         dept_code 
##                       100.0000000                       100.0000000 
##                  dept_name_abbrev                      course_level 
##                         3.4131880                       100.0000000 
##                       course_type                     online_course 
##                         3.4131880                       100.0000000 
##                            uptake parent_discussion_post_created_at 
##                        56.0076794                        56.0752646 
##                 response_time_gap 
##                        56.0752646
cat("Note: 1. These variables have no data: school_code, school_name_abbrev, dept_code, course_level, online_course. 2. The variables(50–57% na_values) primarily related to post interaction/response have missing values structurally induced by the post_type: parent_discussion_post_id, uptake, parent_discussion_post_created_at, response_time_gap, user_group_id. 3. Variable with low missingness (< 8%): first_generation, race, dept_name_abbrev, term_code, course_title, course_type, gender.\n")
## Note: 1. These variables have no data: school_code, school_name_abbrev, dept_code, course_level, online_course. 2. The variables(50–57% na_values) primarily related to post interaction/response have missing values structurally induced by the post_type: parent_discussion_post_id, uptake, parent_discussion_post_created_at, response_time_gap, user_group_id. 3. Variable with low missingness (< 8%): first_generation, race, dept_name_abbrev, term_code, course_title, course_type, gender.
cat("---!!! Need to handle missing values in course_type and dept_name_abbrev---\n")
## ---!!! Need to handle missing values in course_type and dept_name_abbrev---
# Count the number of unique values for each variable in df_post
# (a quick duplicate / cardinality check before choosing model variables)
cat("\n--- Unique Value Count in df_post ---\n")
## 
## --- Unique Value Count in df_post ---
df_post_unique_counts <- summarise(df_post, across(everything(), n_distinct))
print(df_post_unique_counts)
##      X.1      X mellon_id canvas_course_id discussion_post_id
## 1 371383 371383     26474              746             371383
##   parent_discussion_post_id discussion_post_created_at
## 1                     91914                     368113
##   discussion_topic_posted_at discussion_topic_id user_group_id gpa_term
## 1                       3389                7667          1081      333
##   Analytic Clout Tone Authentic int_student citizenship_app first_generation
## 1     8560  8964 7119      8360           2               4                3
##   low_income first_language race gender term class_size term_code course_title
## 1          3             31    6      3    8        180         9          196
##   school_code school_name_abbrev dept_code dept_name_abbrev course_level
## 1           1                  1         1               53            1
##   course_type online_course informativeness politeness uptake formality
## 1          10             1           53674        460 153498    366046
##   word_cnt parent_discussion_post_created_at post_time_gap response_time_gap
## 1     1444                             89147        351511            162193
##   first_post_created_at post_type GPT discussion_post_content_clean
## 1                  7665         2   2                        370881

—Research Question —

RQ1: At the discussion topic level, how do students’ content creation and social engagement differ between cases where GPT use = 0 (no GenAI) and GPT use = 1 (GenAI used)? Build the formula with course-level controls + fixed effects for the full dataset: formula <- as.formula(paste0(metric, " ~ GPT_use + discussion_size_s + pct_international_s + pct_posts_s + course_type | discussion_topic_content_clean")) RQ2: How do these impacts vary across different majors? Same formula as RQ1, but it will run through each field’s subset. RQ3: How do these impacts vary across different discussion prompt types? Same formula as RQ1, but it will run through each discussion prompt type’s subset.

—Summary of key variables for next steps — Core Identifiers and Key Variables - ID: mellon_id (Student ID), discussion_topic_id (Grouping ID for aggregation: used to merge post-level and topic-level data) - Independent variable: GPT (0=No GPT time, 1=GPT time) - Post Metrics: Analytic, informativeness, word_cnt (These will be averaged per topic)

Control Variables - discussion_size: Number of unique students (mellon_id) within each discussion_topic_id → Calculate using df_post %>% group_by(discussion_topic_id) %>% summarise(discussion_size = n_distinct(mellon_id)) - course_type: Type of course (e.g., lecture, seminar) - int_student: Percentage of international students within each discussion topic → Calculate as mean(int_student == 1) per discussion_topic_id - post_type_ratio: Proportion of original posts relative to total posts (posts + replies) → pct_posts = number_of_posts / (number_of_posts + number_of_replies)

RQ2 dept_name_abbrev: will be used to subset fields

—All variables (for reference only) — - Content: discussion_post_content_clean - Course context variables: dept_name_abbrev, class_size, course_type, term, course_title - Identifiers: canvas_course_id, discussion_post_id, parent_discussion_post_id, user_group_id - Time-based variables: discussion_post_created_at, discussion_topic_posted_at, parent_discussion_post_created_at - Academic: gpa_term - Demographic Variables: int_student, citizenship_app, first_generation, low_income, first_language, race, gender. - Interaction: post_type, post_time_gap, response_time_gap, uptake, first_post_created_at - Content tone: Clout, Authentic, Tone, politeness, formality

2.3 Data Aggregation: Post-level to Topic-level

# Create topic-level aggregated variables from post-level data.
# Output: one row per discussion_topic_id, combining engagement counts,
# averaged content measures, a binary GPT-use flag, and course descriptors.
topic_aggregated <- df_post %>%
  group_by(discussion_topic_id) %>%
  summarise(
    # Student and discussion characteristics
    # Number of unique students (mellon_id) contributing to the topic
    discussion_size = n_distinct(mellon_id),
    # Assumes int_student is coded 0/1 so mean*100 is the % international — TODO confirm coding
    pct_international = mean(int_student, na.rm = TRUE) * 100,
    
    # Percentage of Original Posts: The count of "Post" type contributions divided by the total count of "Post" and "Reply" types.
    # NOTE(review): produces NaN if a topic has zero "Post"/"Reply" rows — confirm post_type only takes these two values
    pct_posts = 100 * sum(post_type == "Post", na.rm = TRUE) / 
                        (sum(post_type == "Post", na.rm = TRUE) + sum(post_type == "Reply", na.rm = TRUE)),
    
    # Content measures - averages per topic: Averaged measures of content length, depth, and width.
    avg_word_count = mean(word_cnt, na.rm = TRUE),
    avg_analytic = mean(Analytic, na.rm = TRUE),
    avg_informativeness = mean(informativeness, na.rm = TRUE),
    
    # GPT usage indicators
    # 1 if ANY post in the topic has GPT == 1, otherwise 0 (na.rm makes NA-only topics 0)
    GPT_use = as.numeric(any(GPT == 1, na.rm = TRUE)),

    # Course characteristics (taking first value since should be constant within topic). 
    ## While course_type and dept_name_abbrev have missing values, we will handle that later in df_combined just in case some miss values for certain rows can be found in others with the same topic_id.
    #class_size = first(class_size), # this is not necessary
    course_type = first(course_type),
    dept_name_abbrev = first(dept_name_abbrev),

    .groups = 'drop' # Important: Removes the grouping structure after summarization
  )
cat("✅ Successfully aggregated post-level data into 'topic_aggregated' dataframe.\n")
## ✅ Successfully aggregated post-level data into 'topic_aggregated' dataframe.
# Merge aggregated post-level metrics with topic-level data.
# left_join keeps every aggregated topic even if df_topic lacks a matching row.
df_combined <- left_join(topic_aggregated, df_topic, by = "discussion_topic_id")
cat("✅ Merged 'topic_aggregated' with 'df_topic'. The final analytical dataset is 'df_combined'.\n")
## ✅ Merged 'topic_aggregated' with 'df_topic'. The final analytical dataset is 'df_combined'.
cat(paste("Final Combined Dataset ('df_combined') has", nrow(df_combined), "rows (topics).\n"))
## Final Combined Dataset ('df_combined') has 7667 rows (topics).

2.4 Check Core Metrics

# Content creation indicators (topic-level outcome metrics)
content_metrics <- c(
  "avg_similarity", "embedding_variance", "semantic_spread",
  "MRS_depth", "avg_word_count", "avg_analytic", "avg_informativeness"
)
# Social engagement indicators (topic-level outcome metrics)
social_metrics <- c(
  "percent_replied", "avg_depth", "max_depth",
  "avg_width", "max_width"
)

# Master list used below for plotting, summaries, and standardization
all_metrics <- c(content_metrics, social_metrics)
cat("✅ Content and Social Metrics lists defined.\n")
## ✅ Content and Social Metrics lists defined.
# Build one histogram per metric so distributions can be eyeballed side by side
plot_list <- map(all_metrics, function(metric) {
  ggplot(df_combined, aes(x = .data[[metric]])) +
    geom_histogram(bins = 30, fill = "#0072B2", color = "white", alpha = 0.8) +
    labs(title = paste("Distribution of", metric), x = metric, y = "Count") +
    theme_minimal(base_size = 10) +
    theme(
      plot.title = element_text(size = 5, face = "bold"),
      axis.title.x = element_text(size = 6),
      axis.title.y = element_text(size = 6),
      axis.text.x = element_text(angle = 45, hjust = 1, size = 6),
      axis.text.y = element_text(size = 6)
    )
})

# Tile the 12 histograms into a compact 3-row x 4-column grid via patchwork
combined_plot <- wrap_plots(plot_list, ncol = 4, nrow = 3)
print(combined_plot)

cat("--- Summary Statistics for Key Metrics in df_combined ---\n")
## --- Summary Statistics for Key Metrics in df_combined ---
# Reshape to long format so one summary row per metric can be computed.
# all_of() errors loudly if any metric name is missing from df_combined.
metrics_long <- df_combined %>%
  dplyr::select(all_of(all_metrics)) %>%
  pivot_longer(cols = everything(), names_to = "metric", values_to = "value")

df_combined_summary <- metrics_long %>%
  group_by(metric) %>%
  summarise(
    Mean = mean(value, na.rm = TRUE),
    SD = sd(value, na.rm = TRUE),
    Min = min(value, na.rm = TRUE),
    Max = max(value, na.rm = TRUE),
    Q99 = quantile(value, 0.99, na.rm = TRUE), # 99th percentile flags outliers
    .groups = 'drop'
  )

# Print all 12 metrics (tibble printing truncates without n =)
print(df_combined_summary, n = 12)
## # A tibble: 12 × 6
##    metric                   Mean      SD        Min        Max       Q99
##    <chr>                   <dbl>   <dbl>      <dbl>      <dbl>     <dbl>
##  1 MRS_depth             0.266   5.43e-2  0.168        0.484     0.393  
##  2 avg_analytic         68.2     1.41e+1 13.4         98.4      94.5    
##  3 avg_depth             1.39    2.54e-1  1            2.02      1.79   
##  4 avg_informativeness   0.886   4.39e-2  0.701        1         1      
##  5 avg_similarity        0.525   1.46e-1  0.0876       0.966     0.806  
##  6 avg_width             0.383   2.50e-1  0            0.9       0.754  
##  7 avg_word_count      178.      1.36e+2 11.1       2251.      763.     
##  8 embedding_variance    0.00119 3.79e-4  0.0000824    0.00219   0.00199
##  9 max_depth             1.94    6.07e-1  1            5         3      
## 10 max_width             3.11    3.20e+0  0           45        15      
## 11 percent_replied       0.471   3.27e-1  0            1         0.938  
## 12 semantic_spread       1.00    7.59e-8  1.000        1.00      1.00
# (A constant manual-scaling alternative was considered — e.g. avg_word_count/1000,
#  avg_analytic/100, max_width/10 — but was superseded by z-score standardization.)
# Apply z-score standardization to all metrics; as.numeric() drops the
# matrix attributes that scale() attaches, leaving a plain numeric column.
df_combined <- df_combined %>%
  mutate(across(all_of(all_metrics), function(v) as.numeric(scale(v))))

2.5: Cleaning and Recoding for Analysis

# Check missing values in the intermediate combined dataset
cat("--- Missing Value Count in df_combined (Before Cleaning) ---\n")
## --- Missing Value Count in df_combined (Before Cleaning) ---
na_df_combined <- colSums(is.na(df_combined))
print(na_df_combined[na_df_combined > 0])
##                    course_type               dept_name_abbrev 
##                            264                            264 
## discussion_topic_content_clean 
##                            392
cat("Note: Proceeding to clean missing values.\n")
## Note: Proceeding to clean missing values.
# Handle missing values:
#  - drop topics whose prompt content is NA or a text placeholder
#  - recode NA course descriptors to an explicit "Unknown" level (keeps the rows)
df_analysis <- df_combined %>%
  filter(
    !is.na(discussion_topic_content_clean),
    !(discussion_topic_content_clean %in% missing_content_placeholders)
  ) %>%
  mutate(
    dept_name_abbrev = coalesce(dept_name_abbrev, "Unknown"),
    course_type      = coalesce(course_type, "Unknown")
  )
cat("✅ Rows remaining after filtering NA topic content:", nrow(df_analysis), "\n")
## ✅ Rows remaining after filtering NA topic content: 7235
cat("--- Missing Value Count in df_analysis (After Cleaning) ---\n")
## --- Missing Value Count in df_analysis (After Cleaning) ---
colSums_na_df_analysis <- colSums(is.na(df_analysis))
print(colSums_na_df_analysis[colSums_na_df_analysis > 0])
## named numeric(0)
# Should show no NAs for key variables
# Should show no NAs for key variables

2.6 Factorize and Standardize Variables

# Convert categorical variables to factors and standardize numeric control
# variables (z-score) for modeling or comparison.
df_analysis_transformed <- df_analysis %>% 
  mutate(
    # Factorize categorical variables
    course_type = as.factor(course_type),
    discussion_topic_content_clean = as.factor(discussion_topic_content_clean),
    # Standardize continuous control variables (Z-score); [,1] drops scale()'s matrix wrapper
    discussion_size = scale(discussion_size)[,1],
    pct_international = scale(pct_international)[,1],
    pct_posts = scale(pct_posts)[,1]
  )
head(df_analysis_transformed)
## # A tibble: 6 × 27
##   discussion_topic_id discussion_size pct_international pct_posts avg_word_count
##                 <int>           <dbl>             <dbl>     <dbl>          <dbl>
## 1              593148           0.844             0.221      1.64        -0.325 
## 2              593152           3.54             -0.243      1.59        -0.283 
## 3              593154           3.34             -0.236      1.62         0.0721
## 4              593155           0.844             0.183      1.51        -0.209 
## 5              596691          -0.268             0.169      1.64         0.0627
## 6              596692          -0.323             0.299      1.64         0.0544
## # ℹ 22 more variables: avg_analytic <dbl>, avg_informativeness <dbl>,
## #   GPT_use <dbl>, course_type <fct>, dept_name_abbrev <chr>, X <int>,
## #   avg_similarity <dbl>, embedding_variance <dbl>, semantic_spread <dbl>,
## #   MRS_depth <dbl>, thread_MRS_avg_depth <dbl>, post_count <int>,
## #   max_depth <dbl>, avg_depth <dbl>, max_width <dbl>, avg_width <dbl>,
## #   total_users <int>, replied_users <int>, percent_replied <dbl>,
## #   avg_top_post_length <dbl>, discussion_topic_title <chr>, …
# FIX: messages previously referred to 'df_analysis_field_transformed', which
# does not exist; the object created above is 'df_analysis_transformed'.
cat("✅ Created 'df_analysis_transformed': Categorical variables factorized and controls standardized.\n")
## ✅ Created 'df_analysis_transformed': Categorical variables factorized and controls standardized.
cat(paste("Final Analytical Dataset ('df_analysis_transformed') has", nrow(df_analysis_transformed), "rows.\n"))
## Final Analytical Dataset ('df_analysis_transformed') has 7235 rows.
# Only factorize variables (no standardization) — companion dataset for models
# that need the controls on their raw scale.
# FIX: removed the trailing comma inside mutate(), which errors on older dplyr.
df_analysis_factorized <- df_analysis %>%  
  mutate(
    course_type = as.factor(course_type),
    discussion_topic_content_clean = as.factor(discussion_topic_content_clean)
  )
head(df_analysis_factorized)
## # A tibble: 6 × 27
##   discussion_topic_id discussion_size pct_international pct_posts avg_word_count
##                 <int>           <int>             <dbl>     <dbl>          <dbl>
## 1              593148              59              22.0     100          -0.325 
## 2              593152             156              13.2      98.7        -0.283 
## 3              593154             149              13.3      99.3         0.0721
## 4              593155              59              21.3      96.7        -0.209 
## 5              596691              19              21.1     100           0.0627
## 6              596692              17              23.5     100           0.0544
## # ℹ 22 more variables: avg_analytic <dbl>, avg_informativeness <dbl>,
## #   GPT_use <dbl>, course_type <fct>, dept_name_abbrev <chr>, X <int>,
## #   avg_similarity <dbl>, embedding_variance <dbl>, semantic_spread <dbl>,
## #   MRS_depth <dbl>, thread_MRS_avg_depth <dbl>, post_count <int>,
## #   max_depth <dbl>, avg_depth <dbl>, max_width <dbl>, avg_width <dbl>,
## #   total_users <int>, replied_users <int>, percent_replied <dbl>,
## #   avg_top_post_length <dbl>, discussion_topic_title <chr>, …

2.7 Dataset for RQ2 (Grouping Departments)

This step creates the field variable needed to subset the data for RQ2.

# Inspect Unique Departments (sanity check before building the 'field' mapping)
unique_dept <- unique(df_analysis$dept_name_abbrev)
cat("departments:", unique_dept, "\n")
## departments: COMLIT SOCSCI FRENCH Unknown PHRMSCI MGMT WRITING ACENG MUSIC PUBHLTH LSCI CRM/LAW ECON PHILOS ENGLISH PSCI EDUC HISTORY DRAMA ASIANAM UNIAFF NURSCI COMPSCI GERMAN PHYSICS SOCIOL FLM&MDA BIOSCI ART ENGR ARABIC DANCE POLSCI SOCECOL I&CSCI UNISTU EAS EARTHSS SPANISH UPPP ENGRMAE IN4MATX HUMAN CHC/LAT PSYCH INTLST SPPS MATH
# Create 'field' Variables: Language, Art, Humanities, Social Science, Theoretical STEM, Applied STEM, Others  
# case_when evaluates top-to-bottom; each department matches at most one branch.
# NOTE(review): "ANTHRO" below does not appear in the observed department list
# printed above — harmless here, but confirm whether it occurs in other pulls.
df_analysis_field <- df_analysis_transformed %>%
  mutate(field = case_when(
    dept_name_abbrev %in% c("FRENCH","ENGLISH","GERMAN","ARABIC","SPANISH", "ACENG") ~ "Language",
    dept_name_abbrev %in% c("MUSIC","ART","DRAMA","DANCE") ~ "Arts",
    dept_name_abbrev %in% c("COMLIT","FLM&MDA","WRITING","PHILOS","HISTORY",
                            "ASIANAM","EAS","HUMAN") ~ "Humanities",
    dept_name_abbrev %in% c("CHC/LAT","SOCSCI","CRM/LAW","ANTHRO","ECON","PSCI",
                            "SOCIOL","POLSCI","SOCECOL","UPPP","INTLST","LSCI",
                            "PSYCH","SPPS","MGMT","EDUC") ~ "Social Sciences",
    dept_name_abbrev %in% c("PHYSICS","EARTHSS","MATH","BIOSCI") ~ "Theoretical STEM",
    dept_name_abbrev %in% c("COMPSCI","I&CSCI","IN4MATX","ENGR","ENGRMAE",
                            "PHRMSCI","PUBHLTH","NURSCI") ~ "Applied STEM",
    dept_name_abbrev %in% c("UNIAFF","UNISTU") ~ "Others",
    TRUE ~ "Unknown"   # catch-all for missing or uncategorized departments
  ))
cat("✅ Created field classification variable 'field'.\n")
## ✅ Created field classification variable 'field'.
# Factorize 'field' so split() below produces one subset per level
df_analysis_field <- df_analysis_field %>%  
  mutate(field = as.factor(field))

# Split the analytical data frame into one data frame per field (RQ2 subsets).
field_subsets_all <- split(df_analysis_field, df_analysis_field$field)

# Drop the "Unknown" and "Others" subsets — they are not analyzed in RQ2
field_subsets <- field_subsets_all[!(names(field_subsets_all) %in% c("Unknown", "Others"))]

# To view the names of all the created subsets:
cat("Subset Names (for iteration in regression):", names(field_subsets),"\n")
## Subset Names (for iteration in regression): Applied STEM Arts Humanities Language Social Sciences Theoretical STEM
# Confirming the names of the subsets created
cat("✅ Data successfully split into", length(field_subsets), "subsets.\n")
## ✅ Data successfully split into 6 subsets.
# Check Subset Sizes: topics (rows) per field subset.
# vapply guarantees a named integer vector (sapply's return type depends on input).
subset_sizes <- vapply(field_subsets, nrow, integer(1))
# Print the table of subset sizes for review.
cat("Field Subset Sizes (Number of Discussion Topics)\n")
## Field Subset Sizes (Number of Discussion Topics)
print(subset_sizes)
##     Applied STEM             Arts       Humanities         Language 
##             2056              133             1061              203 
##  Social Sciences Theoretical STEM 
##             3414               73
# Assign individual subsets to new variables (#Language, Art, Humanities, Social Science, Theoretical STEM, Applied STEM, Others)
# list2env(field_subsets, envir = .GlobalEnv) 
# language_df <- field_subsets$`Language`
# art_df <- field_subsets$`Arts`
# humanities_df <- field_subsets$`Humanities`
# social_sciences_df <- field_subsets$`Social Sciences`
# theoretical_stem_df <- field_subsets$`Theoretical STEM`
# applied_stem_df <- field_subsets$`Applied STEM`
# others_df <- field_subsets$`Others`
# head(language_df)

2.8 Data for RQ3 (Prompt Type Subsets)

—For RQ 3, we classified discussion prompts into different types using multiple frameworks. 1) knowledge types: “Metacognitive”, “Procedural”, “Conceptual”, “Factual”; 2) cognitive types: “Create”, “Apply”, “Evaluate”, “Analyze”, “Remember”, “Understand”; 3) purpose types: “academic”, “social”, “logistics”; 4) academic types: “knowledge_reproduction”, “open_qa”, “synthesis_application”, “personal_connection”, “collaborative_interaction” —

2.8.1 Data for knowledge and cognitive types

# Load data with knowledge and cognitive prompt-type classifications
df_knowledge_cognitive <- read.csv("data/knowledge_cognitive.csv")
cat("✅ Data successfully loaded! Columns available:\n")
## ✅ Data successfully loaded! Columns available:
print(names(df_knowledge_cognitive))
## [1] "Unnamed..0"                     "discussion_topic_id"           
## [3] "discussion_topic_title"         "discussion_topic_content_clean"
## [5] "Pred_Knowledge_Type"            "Pred_Cognitive_Process"
cat("columns discussion_topic_id, Pred_Knowledge_Type, Pred_Cognitive_Process will be added to main dataset\n")
## columns discussion_topic_id, Pred_Knowledge_Type, Pred_Cognitive_Process will be added to main dataset
# Display unique values in the "Pred_Knowledge_Type" column
unique(df_knowledge_cognitive$Pred_Knowledge_Type)
## [1] "Metacognitive" "Procedural"    "Conceptual"    "Factual"      
## [5] ""
# Display unique values in the "Pred_Cognitive_Process" column
unique(df_knowledge_cognitive$Pred_Cognitive_Process)
## [1] "Create"     "Apply"      "Evaluate"   "Analyze"    "Remember"  
## [6] "Understand" ""
# Given we see "" from unique value check, Check overall missingness before cleaning
# Define the common text representations of missing values
missing_strings <- c("", "NA", "N/A", "NULL", "Unknown", "No message")

# Enhanced check covering NA, empty strings, and common missing text
cat("=== Enhanced Missing Value Summary (All Forms) ===\n")
## === Enhanced Missing Value Summary (All Forms) ===
# vapply (not sapply) so the result is guaranteed to be a named integer vector
enhanced_missing_summary <- vapply(
  df_knowledge_cognitive[, c("Pred_Knowledge_Type", "Pred_Cognitive_Process", "discussion_topic_content_clean")],
  function(x) sum(is.na(x) | x %in% missing_strings),
  integer(1)
)
print(enhanced_missing_summary)
##            Pred_Knowledge_Type         Pred_Cognitive_Process 
##                            432                            432 
## discussion_topic_content_clean 
##                            432
cat("=== Clean data and Validation ===\n")
## === Clean data and Validation ===
# Clean the dataframe: drop rows whose Pred_Knowledge_Type is missing or a placeholder.
# FIX: %in% never returns NA, so true NA values would previously have been KEPT;
# the explicit !is.na() guard closes that gap (no-op on the current data, where
# missingness appears only as "").
df_knowledge_cognitive_clean <- df_knowledge_cognitive %>%
  filter(
    !is.na(Pred_Knowledge_Type),
    !(Pred_Knowledge_Type %in% missing_strings)
  )

# Validation: re-check the cleaned subset for any remaining missing values.
# (The former `x == ""` term was redundant — "" is already in missing_strings.)
enhanced_missing_summary_clean <- vapply(
  df_knowledge_cognitive_clean[, c("Pred_Knowledge_Type", "Pred_Cognitive_Process", "discussion_topic_content_clean")],
  function(x) sum(is.na(x) | x %in% missing_strings),
  integer(1)
)

# Should show 0 for the key columns
print(enhanced_missing_summary_clean)
##            Pred_Knowledge_Type         Pred_Cognitive_Process 
##                              0                              0 
## discussion_topic_content_clean 
##                              0
cat("=== Validation: No missing values confirmed ===\n")
## === Validation: No missing values confirmed ===
# Select only the columns needed for the merge into the main analytical dataset
df_prompt_types_to_merge <- df_knowledge_cognitive_clean %>%
  dplyr::select(discussion_topic_id, Pred_Knowledge_Type, Pred_Cognitive_Process)

# Left join these columns to the main analytical dataframe
df_analysis_prompt_types <- left_join(
  df_analysis_field,
  df_prompt_types_to_merge,
  by = "discussion_topic_id"
)

# Factorize the new categorical columns for consistency
df_analysis_prompt_types <- df_analysis_prompt_types %>%
  mutate(
    Pred_Knowledge_Type = as.factor(Pred_Knowledge_Type),
    Pred_Cognitive_Process = as.factor(Pred_Cognitive_Process)
  )

cat("✅ Merged 'Pred_Knowledge_Type' and 'Pred_Cognitive_Process' into 'df_analysis_prompt_types'.\n")
## ✅ Merged 'Pred_Knowledge_Type' and 'Pred_Cognitive_Process' into 'df_analysis_prompt_types'.
cat(paste("New dataset has", nrow(df_analysis_prompt_types), "rows.\n"))
## New dataset has 7235 rows.
# Intended label orderings from the research question. NOTE(review): these two
# vectors are never applied (e.g. via factor(levels = ...)), so split() below
# falls back to the factors' existing alphabetical levels — see the printed
# subset order. Apply them explicitly if a specific ordering matters downstream.
knowledge_levels <- c("Metacognitive", "Procedural", "Conceptual", "Factual")
cognitive_levels <- c("Create", "Apply", "Evaluate", "Analyze", "Remember", "Understand")
# --- A. Create Knowledge Subsets ---
# Split into a list of data frames, one per Pred_Knowledge_Type level
knowledge_subsets <- split(df_analysis_prompt_types, df_analysis_prompt_types$Pred_Knowledge_Type)

# --- B. Create Cognitive Subsets ---
# Split into a list of data frames, one per Pred_Cognitive_Process level
cognitive_subsets <- split(df_analysis_prompt_types, df_analysis_prompt_types$Pred_Cognitive_Process)

# --- Validation: Check Subset Sizes ---
cat("Knowledge Subset Sizes (Number of Discussion Topics)\n")
## Knowledge Subset Sizes (Number of Discussion Topics)
knowledge_subset_sizes <- sapply(knowledge_subsets, nrow)
print(knowledge_subset_sizes)
##    Conceptual       Factual Metacognitive    Procedural 
##          3803           335          2874           223
cat("Cognitive Subset Sizes (Number of Discussion Topics)\n")
## Cognitive Subset Sizes (Number of Discussion Topics)
cognitive_subset_sizes <- sapply(cognitive_subsets, nrow)
print(cognitive_subset_sizes)
##    Analyze      Apply     Create   Evaluate   Remember Understand 
##       1160        214       1579       3482         74        726
cat("\n✅ Created 'knowledge_subsets' (4 subsets) and 'cognitive_subsets' (6 subsets).\n")
## 
## ✅ Created 'knowledge_subsets' (4 subsets) and 'cognitive_subsets' (6 subsets).

2.8.2 Data for Purpose-Type

# Load and Process Purpose-Type Data (path is relative to the project root)
df_purpose_type <- read.csv("data/purpose_type.csv")
cat("✅ Purpose-type data loaded successfully! Columns available:\n")
## ✅ Purpose-type data loaded successfully! Columns available:
print(names(df_purpose_type))
## [1] "discussion_topic_id" "QUESTION"            "Bloom_Level"        
## [4] "cleaned_text"        "academic"            "social"             
## [7] "logistics"
# Count NA / placeholder values in the key columns (missing_strings is defined
# in the cleaning step earlier in the document)
cat("=== Enhanced Missing Value Summary (All Forms) ===\n")
## === Enhanced Missing Value Summary (All Forms) ===
enhanced_missing_summary1 <- sapply(
 df_purpose_type[, c("academic", "social", "logistics", "cleaned_text")],
 function(x) sum(is.na(x) | x %in% missing_strings)
)
print(enhanced_missing_summary1)
##     academic       social    logistics cleaned_text 
##            0            0            0          432
# Plan: df_analysis_field has 7235 observations and df_purpose_type has 7667;
# both carry discussion_topic_id, which serves as the merge key. Steps:
# 1) add the three purpose columns ("academic", "social", "logistics") from
#    df_purpose_type to df_analysis_field; 2) name the merged dataframe
#    df_analysis_purposetype; 3) check its size; 4) create subsets for
#    academic, social, and logistics (value == 1) so that e.g. the academic
#    subset only has rows with academic == 1; 5) check the size of each subset.

# 0. Select only the ID and the three purpose-type columns from df_purpose_type
df_purpose_cols <- df_purpose_type %>%
  dplyr::select(discussion_topic_id, academic, social, logistics)

# 1. Add the three columns from df_purpose_cols to df_analysis_field using a
#    left join (keeps all analysis rows; unmatched IDs would yield NA flags)
df_analysis_purposetype <- left_join(df_analysis_field, df_purpose_cols, by = "discussion_topic_id")

# 2. Check the size of df_analysis_purposetype
cat("✅ Merged dataset 'df_analysis_purposetype' created.\n")
## ✅ Merged dataset 'df_analysis_purposetype' created.
cat("3. Size of df_analysis_purposetype:", nrow(df_analysis_purposetype), "rows.\n")
## 3. Size of df_analysis_purposetype: 7235 rows.
cat("Note: The number of rows remains 7235, matching df_analysis_field.\n")
## Note: The number of rows remains 7235, matching df_analysis_field.
# 4. Create subsets for academic, social, and logistics (value == 1);
#    filter() also drops rows where the flag is NA, not just where it is 0
academic_subsets <- df_analysis_purposetype %>% filter(academic == 1)
social_subsets <- df_analysis_purposetype %>% filter(social == 1)
logistics_subsets <- df_analysis_purposetype %>% filter(logistics == 1)
 
# 5. Check the size of each subset (rows can belong to more than one purpose,
#    so the three counts need not sum to 7235)
cat("✅ Purpose-type subsets created successfully.\n")
## ✅ Purpose-type subsets created successfully.
cat("Academic subset rows: ", nrow(academic_subsets), "\n")
## Academic subset rows:  5938
cat("Social subset rows:   ", nrow(social_subsets), "\n")
## Social subset rows:    875
cat("Logistics subset rows:", nrow(logistics_subsets), "\n")
## Logistics subset rows: 1159
cat("Verification: Checking if 'academic_subset' only contains rows where academic == 1:\n")
## Verification: Checking if 'academic_subset' only contains rows where academic == 1:
unique(academic_subsets$academic)
## [1] 1
unique(social_subsets$social)
## [1] 1
unique(logistics_subsets$logistics)
## [1] 1
cat("Note: confirmed\n")
## Note: confirmed
cat("\n🎉 All subsets for Purpose Types created and verified successfully!\n")
## 
## 🎉 All subsets for Purpose Types created and verified successfully!

2.8.3 Data for Academic-Type

# Load and Process Academic-Type Data
df_academic_type <- read.csv("data/academic_type.csv")

# Rename columns to shorter names (drop the "annotator_openai_" prefix)
df_academic_type <- df_academic_type %>%
  rename(
    knowledge_reproduction = annotator_openai_knowledge_reproduction,
    open_qa = annotator_openai_open_qa,
    synthesis_application = annotator_openai_synthesis_application,
    personal_connection = annotator_openai_personal_connection,
    collaborative_interaction = annotator_openai_collaborative_interaction
  )
cat("✅ Column names renamed successfully:\n")
## ✅ Column names renamed successfully:
print(names(df_academic_type))
##  [1] "Unnamed..0"                     "discussion_topic_id"           
##  [3] "discussion_topic_title"         "discussion_topic_content_clean"
##  [5] "QUESTION"                       "Bloom_Level"                   
##  [7] "cleaned_text"                   "academic"                      
##  [9] "social"                         "logistics"                     
## [11] "openai_raw_output"              "knowledge_reproduction"        
## [13] "open_qa"                        "synthesis_application"         
## [15] "personal_connection"            "collaborative_interaction"
# Check for missing values in the key columns (true NA only; these are
# numeric 0/1 flags, so the placeholder-string check is not needed here)
cat("=== Missing Value Summary (Academic-Type Data) ===\n")
## === Missing Value Summary (Academic-Type Data) ===
missing_summary_academic <- sapply(
  df_academic_type[, c(
    "knowledge_reproduction",
    "open_qa",
    "synthesis_application",
    "personal_connection",
    "collaborative_interaction"
  )],
  function(x) sum(is.na(x))
)
print(missing_summary_academic)
##    knowledge_reproduction                   open_qa     synthesis_application 
##                         0                         0                         0 
##       personal_connection collaborative_interaction 
##                         0                         0
# === Merge df_academic_type with df_analysis_field ===
# 0. Select only the ID and the five academic-type columns from df_academic_type
df_academic_cols <- df_academic_type %>%
  dplyr::select(discussion_topic_id, 
                knowledge_reproduction, 
                open_qa, 
                synthesis_application, 
                personal_connection, 
                collaborative_interaction)

# 1. Add the five academic columns to df_analysis_field using a left join
df_analysis_academictype <- left_join(df_analysis_field, df_academic_cols, by = "discussion_topic_id") 

# 2. Check the size of df_analysis_academictype
cat("✅ Merged dataset 'df_analysis_academictype' created.\n")
## ✅ Merged dataset 'df_analysis_academictype' created.
cat("3. Size of df_analysis_academictype:", nrow(df_analysis_academictype), "rows.\n")
## 3. Size of df_analysis_academictype: 7235 rows.
cat("Note: The number of rows remains 7235, matching df_analysis_field.\n")
## Note: The number of rows remains 7235, matching df_analysis_field.
# 4. Create subsets for academic types (value == 1); filter() also drops
#    rows where the flag is NA
knowledge_reproduction_subsets <- df_analysis_academictype %>% filter(knowledge_reproduction == 1)
open_qa_subsets <- df_analysis_academictype %>% filter(open_qa == 1)
synthesis_application_subsets <- df_analysis_academictype %>% filter(synthesis_application == 1)
personal_connection_subsets <- df_analysis_academictype %>% filter(personal_connection == 1)
collaborative_interaction_subsets <- df_analysis_academictype %>% filter(collaborative_interaction == 1)

# 5. Check the size of each subset (categories are non-exclusive, so counts
#    need not sum to 7235)
cat("✅ Academic-type subsets created successfully.\n")
## ✅ Academic-type subsets created successfully.
cat("knowledge_reproduction subset rows: ", nrow(knowledge_reproduction_subsets), "\n")
## knowledge_reproduction subset rows:  165
cat("open_qa_subset rows:   ", nrow(open_qa_subsets), "\n")
## open_qa_subset rows:    176
cat("synthesis_application_subset rows:", nrow(synthesis_application_subsets), "\n")
## synthesis_application_subset rows: 3784
cat("personal_connection_subset rows:   ", nrow(personal_connection_subsets), "\n")
## personal_connection_subset rows:    3427
cat("collaborative_interaction_subset rows:", nrow(collaborative_interaction_subsets), "\n")
## collaborative_interaction_subset rows: 3925
# Optional verification (disabled). NOTE(review): the original duplicated the
# personal_connection check here; the check matching the message would be:
# cat("Verification: Checking if 'collaborative_interaction_subset' only contains rows where collaborative_interaction == 1:\n")
# collaborative_check <- table(collaborative_interaction_subsets$collaborative_interaction)
# 
# cat("Note: The table confirms that all rows in the subset have the required value of 1.\n")
# cat("🎉 All academic-type subsets created and verified successfully!\n")

2.9 Create named lists for easy iteration during the regression analysis for RQ3.

# Assemble the knowledge- and cognitive-level subsets into one named list.
# Indexing the split() results by name yields the same element order and the
# same names as listing each element out explicitly.
kc_subsets_list <- c(
  knowledge_subsets[c("Conceptual", "Factual", "Metacognitive", "Procedural")],
  cognitive_subsets[c("Remember", "Understand", "Analyze", "Apply",
                      "Evaluate", "Create")]
)

# Purpose-type subsets, renamed with a "_purpose" suffix so names stay
# unambiguous in the combined list.
purpose_subsets_list <- setNames(
  list(academic_subsets, social_subsets, logistics_subsets),
  c("academic_purpose", "social_purpose", "logistics_purpose")
)

# Academic-type subsets keep their descriptive names (variable names minus
# the "_subsets" suffix).
academic_subsets_list <- setNames(
  list(knowledge_reproduction_subsets, open_qa_subsets,
       synthesis_application_subsets, personal_connection_subsets,
       collaborative_interaction_subsets),
  c("knowledge_reproduction", "open_qa", "synthesis_application",
    "personal_connection", "collaborative_interaction")
)

# One flat named list so the RQ3 regressions can iterate over all 18 subsets
# uniformly.
rq3_subsets_combined <- c(kc_subsets_list, purpose_subsets_list, academic_subsets_list)

cat("✅ Combined all purpose and academic subsets into 'rq3_subsets_combined'.\n")
## ✅ Combined all purpose and academic subsets into 'rq3_subsets_combined'.
cat("--- Structure of the Combined RQ3 Subsets ---\n")
## --- Structure of the Combined RQ3 Subsets ---
cat("Total number of subsets:", length(rq3_subsets_combined), "\n")
## Total number of subsets: 18
cat("Names of the subsets (for iteration):\n")
## Names of the subsets (for iteration):
print(names(rq3_subsets_combined))
##  [1] "Conceptual"                "Factual"                  
##  [3] "Metacognitive"             "Procedural"               
##  [5] "Remember"                  "Understand"               
##  [7] "Analyze"                   "Apply"                    
##  [9] "Evaluate"                  "Create"                   
## [11] "academic_purpose"          "social_purpose"           
## [13] "logistics_purpose"         "knowledge_reproduction"   
## [15] "open_qa"                   "synthesis_application"    
## [17] "personal_connection"       "collaborative_interaction"

— 4. Regression —

4.1 RQ1 - overall change (topic level)

# Accumulator for per-metric regression results (grown with add_row in the
# loop below — acceptable at this scale of 12 metrics)
results_df <- data.frame(
  metric = character(),
  coef_GPT = numeric(),
  se_GPT = numeric(),
  t_value = numeric(),
  p_value = numeric(),
  sigsig = character(),
  stringsAsFactors = FALSE
)

# Fit one fixed-effects regression per outcome metric and record the GPT_use
# coefficient (`all_metrics` is defined earlier in the document)
for (metric in all_metrics) {
  # Outcome ~ treatment + course-level controls; fixest's `|` syntax adds a
  # fixed effect per unique discussion prompt text (presumably a one-to-one
  # proxy for the discussion topic — TODO confirm)
  formula <- as.formula(
    paste0(metric, " ~ GPT_use + discussion_size + pct_international + pct_posts + course_type | discussion_topic_content_clean")
  )
  # Run regression with fixed effects
  model <- feols(formula, data = df_analysis_field)
  # Extract the GPT_use row (Estimate, SE, t, p) from the coefficient table
  coef_summary <- summary(model)$coeftable
  gpt_row <- coef_summary["GPT_use", ]
  # Map the p-value to conventional significance stars (case_when on a
  # length-1 value behaves like a scalar if/else chain here)
  sig <- case_when(
    gpt_row["Pr(>|t|)"] <= 0.001 ~ "***",
    gpt_row["Pr(>|t|)"] <= 0.01  ~ "**",
    gpt_row["Pr(>|t|)"] <= 0.05  ~ "*",
    TRUE                         ~ ""
  )
  
  # Append this metric's results as one row
  results_df <- results_df %>% 
    add_row(
      metric   = metric,
      coef_GPT = gpt_row["Estimate"],
      se_GPT   = gpt_row["Std. Error"],
      t_value  = gpt_row["t value"],
      p_value  = gpt_row["Pr(>|t|)"],
      sigsig   = sig
    )
}

# View summary table
results_df
##                 metric      coef_GPT      se_GPT     t_value      p_value
## 1       avg_similarity  0.0071345695 0.012862353  0.55468618 5.792464e-01
## 2   embedding_variance -0.0004597206 0.012500783 -0.03677535 9.706722e-01
## 3      semantic_spread -0.0485844277 0.023402998 -2.07599159 3.817657e-02
## 4            MRS_depth  0.0947692161 0.025907924  3.65792394 2.689503e-04
## 5       avg_word_count  0.0311344768 0.013929973  2.23507088 2.565600e-02
## 6         avg_analytic  0.1739144289 0.024718008  7.03594034 3.905403e-12
## 7  avg_informativeness  0.0054446573 0.027698550  0.19656832 8.442095e-01
## 8      percent_replied  0.0513754730 0.008865892  5.79473258 9.442316e-09
## 9            avg_depth  0.0043794416 0.003437259  1.27410858 2.029520e-01
## 10           max_depth  0.0266836813 0.016604079  1.60705576 1.083909e-01
## 11           avg_width  0.0032933809 0.003078169  1.06991554 2.849426e-01
## 12           max_width -0.0543230773 0.012907656 -4.20859357 2.825887e-05
##    sigsig
## 1        
## 2        
## 3       *
## 4     ***
## 5       *
## 6     ***
## 7        
## 8     ***
## 9        
## 10       
## 11       
## 12    ***
# Custom top-to-bottom display order for the Y-axis metrics
custom_metric_order <- c(
  "avg_similarity", 
  "embedding_variance", 
  "semantic_spread",
  "avg_word_count",
  "MRS_depth", 
  "avg_analytic", 
  "avg_informativeness",
  "percent_replied", 
  "avg_depth",
  "max_depth",
  "avg_width", 
  "max_width" 
)
# Add a category variable for coloring the metrics (content_metrics and
# social_metrics are defined earlier; a metric in neither gets NA category)
results_df <- results_df %>%
  mutate(
    category = case_when(
      metric %in% content_metrics ~ "Content Creation",
      metric %in% social_metrics ~ "Social Engagement"),
    # Approximate 95% confidence bounds using the normal critical value
    lower_ci = coef_GPT - (1.96 * se_GPT),
    upper_ci = coef_GPT + (1.96 * se_GPT),
    # Fix the metric order from custom_metric_order; levels are reversed so
    # the first metric in that list appears at the TOP of the y-axis
    metric_ordered = factor(metric, levels = rev(custom_metric_order)))

# Create the Forest Plot
ggplot(results_df, aes(x = coef_GPT, y = metric_ordered, color = category)) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.2) +
  # Plot confidence intervals
  geom_errorbarh(aes(xmin = lower_ci, xmax = upper_ci), height = 0.2, linewidth = 0.25) +
  # Plot the point estimate (coefficient)
  geom_point(size = 0.6) +
  # Add significance stars next to the points. NOTE(review): hjust reads
  # results_df directly (outside aes), which relies on the data's row order
  # matching what ggplot draws — fine here, but fragile if rows are reordered.
  geom_text(aes(label = sigsig, x = coef_GPT),
            hjust = ifelse(results_df$coef_GPT > 0, -0.5, 1.5), # Adjust position based on coefficient sign
            vjust = 0.5, size = 2) +
  scale_color_manual(values = c("Content Creation" = "#0072B2", "Social Engagement" = "#D55E00")) +
  labs(
    title = "RQ1: Overall Effect of GPT Use on Discussion Metrics",
    x = "Coefficient of GPT_use (Fixed Effects)",
    y = "Metric",
    color = "Metric Category"
  ) +
  theme_minimal(base_size = 9) +
  theme(
    legend.position = "bottom",
    plot.title.position = "plot"
  )

4.2 RQ2 - subject matter (topic level)

# Store one results dataframe per field of study, keyed by field name
results_list <- list()
# Loop through the per-field dataframes (field_subsets is defined earlier)
for (field_name in names(field_subsets)) {
  rq2_df <- field_subsets[[field_name]]
  
  # Initialize a results dataframe for the current field
  results_field_df <- data.frame(
    metric = character(),
    coef_GPT = numeric(),
    se_GPT = numeric(),
    t_value = numeric(),
    p_value = numeric(),
    sigsig = character(),
    stringsAsFactors = FALSE
  )
  
  # Loop through all metrics for the current dataframe
  for (metric in all_metrics) {
    # Same specification as RQ1 but without the course_type control
    # (presumably less variation within a field subset — TODO confirm)
    formula <- as.formula(
      paste0(metric, " ~ GPT_use + discussion_size + pct_international + pct_posts | discussion_topic_content_clean")
    )
    # Run regression on the current dataframe
    model <- feols(formula, data = rq2_df)
    # Extract GPT coefficient info
    coef_summary <- summary(model)$coeftable
    
    # Guard: GPT_use can be dropped by feols (e.g. no variation or
    # collinearity within a small subset); skip the metric in that case
    if ("GPT_use" %in% rownames(coef_summary)) {
      gpt_row <- coef_summary["GPT_use", ]
      
      # Map the p-value to significance stars
      sig <- case_when(
        gpt_row["Pr(>|t|)"] <= 0.001 ~ "***",
        gpt_row["Pr(>|t|)"] <= 0.01 ~ "**",
        gpt_row["Pr(>|t|)"] <= 0.05 ~ "*",
        TRUE ~ ""
      )
      
      # Add to the current field's results dataframe
      results_field_df <- results_field_df %>% 
        add_row(
          metric = metric,
          coef_GPT = gpt_row["Estimate"],
          se_GPT = gpt_row["Std. Error"],
          t_value = gpt_row["t value"],
          p_value = gpt_row["Pr(>|t|)"],
          sigsig = sig
        )
    }
  }
  # Store the complete results for the current field in the main list
  results_list[[field_name]] <- results_field_df
}
# Stack all fields into one long dataframe; .id records the field name
all_results <- bind_rows(results_list, .id = "field_name")
print(all_results)
##          field_name              metric      coef_GPT      se_GPT      t_value
## 1      Applied STEM      avg_similarity -0.0451781917 0.029415848 -1.535845280
## 2      Applied STEM  embedding_variance  0.0432741052 0.029320755  1.475886454
## 3      Applied STEM     semantic_spread -0.0202254652 0.045054367 -0.448912429
## 4      Applied STEM           MRS_depth  0.1603079561 0.069937815  2.292149901
## 5      Applied STEM      avg_word_count  0.1293759891 0.031304842  4.132778883
## 6      Applied STEM        avg_analytic  0.3938340107 0.065073421  6.052148545
## 7      Applied STEM avg_informativeness  0.1062934315 0.049587284  2.143562299
## 8      Applied STEM     percent_replied  0.0731801071 0.017189179  4.257335863
## 9      Applied STEM           avg_depth  0.0074884721 0.010000979  0.748773921
## 10     Applied STEM           max_depth  0.0540319267 0.028950341  1.866365801
## 11     Applied STEM           avg_width  0.0004911200 0.006928092  0.070888210
## 12     Applied STEM           max_width -0.0541088286 0.028071327 -1.927547946
## 13             Arts      avg_similarity  0.0229993517 0.037901749  0.606815048
## 14             Arts  embedding_variance -0.0248713903 0.037954599 -0.655293200
## 15             Arts     semantic_spread -0.3084767990 0.183636064 -1.679826894
## 16             Arts           MRS_depth  0.0626189614 0.041349575  1.514379822
## 17             Arts      avg_word_count  0.0311770837 0.037722031  0.826495369
## 18             Arts        avg_analytic -0.0200303371 0.054780096 -0.365649908
## 19             Arts avg_informativeness -0.1696486582 0.187796392 -0.903364842
## 20             Arts     percent_replied -0.0040963626 0.039151548 -0.104628367
## 21             Arts           avg_depth -0.0154325271 0.016288457 -0.947451743
## 22             Arts           max_depth -0.0662494006 0.106966786 -0.619345528
## 23             Arts           avg_width -0.0526399831 0.015203499 -3.462359668
## 24             Arts           max_width -0.1387845732 0.069098458 -2.008504636
## 25       Humanities      avg_similarity  0.0248728341 0.030130724  0.825497394
## 26       Humanities  embedding_variance -0.0283353499 0.028301466 -1.001197262
## 27       Humanities     semantic_spread -0.0132071243 0.059952489 -0.220293177
## 28       Humanities           MRS_depth -0.0694039210 0.042453703 -1.634814314
## 29       Humanities      avg_word_count -0.0467145194 0.031591304 -1.478714481
## 30       Humanities        avg_analytic  0.1292423038 0.024245835  5.330495134
## 31       Humanities avg_informativeness  0.0189776147 0.059270979  0.320183927
## 32       Humanities     percent_replied -0.0111851116 0.014109603 -0.792730427
## 33       Humanities           avg_depth -0.0021413971 0.010487188 -0.204191739
## 34       Humanities           max_depth -0.0018485530 0.039068429 -0.047315774
## 35       Humanities           avg_width -0.0051153325 0.007998829 -0.639510201
## 36       Humanities           max_width -0.0335449066 0.028234478 -1.188083119
## 37         Language      avg_similarity -0.0134422239 0.032158866 -0.417994334
## 38         Language  embedding_variance  0.0209997512 0.031171493  0.673684476
## 39         Language     semantic_spread -0.1283748419 0.161132511 -0.796703542
## 40         Language           MRS_depth  0.0028539730 0.005719933  0.498952133
## 41         Language      avg_word_count -0.0207429452 0.017883345 -1.159902982
## 42         Language        avg_analytic  0.0765590185 0.047931733  1.597251221
## 43         Language avg_informativeness  0.1329565333 0.103774961  1.281200508
## 44         Language     percent_replied  0.1670353554 0.045623930  3.661134777
## 45         Language           avg_depth  0.0625360938 0.027683343  2.258979086
## 46         Language           max_depth -0.0092794308 0.109647396 -0.084629742
## 47         Language           avg_width  0.1135685308 0.047990509  2.366478998
## 48         Language           max_width -0.0605862151 0.092633361 -0.654043152
## 49  Social Sciences      avg_similarity  0.0188464465 0.018132359  1.039381942
## 50  Social Sciences  embedding_variance -0.0058153596 0.017557675 -0.331214666
## 51  Social Sciences     semantic_spread -0.0587071667 0.036275358 -1.618375928
## 52  Social Sciences           MRS_depth  0.1175321049 0.027159024  4.327552656
## 53  Social Sciences      avg_word_count -0.0036722857 0.016101437 -0.228071923
## 54  Social Sciences        avg_analytic  0.1005016303 0.018570501  5.411896504
## 55  Social Sciences avg_informativeness -0.0230039549 0.039638296 -0.580346714
## 56  Social Sciences     percent_replied  0.0555110856 0.014446916  3.842417674
## 57  Social Sciences           avg_depth  0.0001907704 0.003982519  0.047901950
## 58  Social Sciences           max_depth  0.0319565218 0.026352112  1.212674043
## 59  Social Sciences           avg_width  0.0011974120 0.003787414  0.316155513
## 60  Social Sciences           max_width -0.0879287330 0.021658745 -4.059733580
## 61 Theoretical STEM      avg_similarity -0.1437075925 0.058897458 -2.439962575
## 62 Theoretical STEM  embedding_variance  0.1578668023 0.056647041  2.786849940
## 63 Theoretical STEM     semantic_spread  0.0572408250 0.302052397  0.189506276
## 64 Theoretical STEM           MRS_depth -0.0068639900 0.015804455 -0.434307281
## 65 Theoretical STEM      avg_word_count -0.0191746204 0.020657047 -0.928236272
## 66 Theoretical STEM        avg_analytic -0.0953836522 0.060928702 -1.565496214
## 67 Theoretical STEM avg_informativeness -0.2056456370 0.152018920 -1.352763443
## 68 Theoretical STEM     percent_replied  0.1489499932 0.038597390  3.859069052
## 69 Theoretical STEM           avg_depth  0.0072973534 0.022877205  0.318979232
## 70 Theoretical STEM           max_depth  0.3319884273 0.260907112  1.272439164
## 71 Theoretical STEM           avg_width -0.0183388395 0.017957736 -1.021222259
## 72 Theoretical STEM           max_width -0.0008636750 0.094105723 -0.009177709
##         p_value sigsig
## 1  1.280084e-01       
## 2  1.433889e-01       
## 3  6.545505e-01       
## 4  2.417766e-02      *
## 5  7.895583e-05    ***
## 6  3.057902e-08    ***
## 7  3.470570e-02      *
## 8  4.979934e-05    ***
## 9  4.559036e-01       
## 10 6.517573e-02       
## 11 9.436406e-01       
## 12 5.699626e-02       
## 13 5.473195e-01       
## 14 5.159391e-01       
## 15 1.005965e-01       
## 16 1.376013e-01       
## 17 4.133072e-01       
## 18 7.165061e-01       
## 19 3.716095e-01       
## 20 9.171809e-01       
## 21 3.489597e-01       
## 22 5.391154e-01       
## 23 1.265993e-03     **
## 24 5.120704e-02       
## 25 4.099715e-01       
## 26 3.178166e-01       
## 27 8.258444e-01       
## 28 1.034988e-01       
## 29 1.406278e-01       
## 30 2.394642e-07    ***
## 31 7.491287e-01       
## 32 4.287775e-01       
## 33 8.383899e-01       
## 34 9.623039e-01       
## 35 5.231475e-01       
## 36 2.360645e-01       
## 37 6.771299e-01       
## 38 5.025557e-01       
## 39 4.281051e-01       
## 40 6.192543e-01       
## 41 2.497193e-01       
## 42 1.143596e-01       
## 43 2.040188e-01       
## 44 4.615549e-04    ***
## 45 2.675177e-02      *
## 46 9.327783e-01       
## 47 2.050623e-02      *
## 48 5.150583e-01       
## 49 2.992209e-01       
## 50 7.406464e-01       
## 51 1.063270e-01       
## 52 1.883692e-05    ***
## 53 8.197005e-01       
## 54 1.046565e-07    ***
## 55 5.619899e-01       
## 56 1.405126e-04    ***
## 57 9.618170e-01       
## 58 2.259313e-01       
## 59 7.520404e-01       
## 60 5.853660e-05    ***
## 61 2.022697e-02      *
## 62 8.757261e-03     **
## 63 8.508569e-01       
## 64 6.668919e-01       
## 65 3.600251e-01       
## 66 1.270069e-01       
## 67 1.853278e-01       
## 68 5.010349e-04    ***
## 69 7.517526e-01       
## 70 2.121100e-01       
## 71 3.145778e-01       
## 72 9.927326e-01
# Prepare the data for plotting: add category colors, ~95% CIs (1.96 * SE),
# and fixed display orders for both metrics and fields
all_results_plot <- all_results %>%
  mutate(
    category = case_when(
      metric %in% content_metrics ~ "Content Creation",
      metric %in% social_metrics ~ "Social Engagement"),
    lower_ci = coef_GPT - (1.96 * se_GPT),
    upper_ci = coef_GPT + (1.96 * se_GPT),
    # Levels reversed so the first metric shows at the top of the y-axis
    metric = factor(metric, levels = rev(custom_metric_order)),
    # Fix the facet order of the fields (STEM first, Arts last)
    field_name = factor(field_name, levels = c("Applied STEM", "Theoretical STEM", "Social Sciences", "Humanities", "Language", "Arts"))
)
# Create the Grouped Forest Plot (one facet per field)
ggplot(all_results_plot, aes(x = coef_GPT, y = metric, color = category)) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.2) +
  
  # Plot confidence intervals and coefficient
  geom_errorbarh(aes(xmin = lower_ci, xmax = upper_ci), height = 0.2, linewidth = 0.15) +
  geom_point(size = 0.12) +
  
  # Add significance stars. NOTE(review): hjust is computed outside aes()
  # from all_results_plot directly, relying on row order matching the plot
  # data — fine here, but fragile if rows are reordered.
  geom_text(aes(label = sigsig, x = coef_GPT),
            hjust = ifelse(all_results_plot$coef_GPT > 0, -0.2, 1.2),
            vjust = 0.5, size = 2) +
            
  scale_color_manual(values = c("Content Creation" = "#0072B2", "Social Engagement" = "#D55E00")) +
  
  facet_wrap(~ field_name, scales = "fixed", ncol = 3) + # Separate plot for each field
  
  labs(
    title = "RQ2: Effect of GPT Use on Discussion Metrics, by Field of Study",
    x = "Coefficient of GPT_use (Fixed Effects)",
    y = "Metric",
    color = "Metric Category"
  ) +
  theme_bw(base_size = 9) +
  theme(
    legend.position = "bottom",
    strip.background = element_rect(fill = "gray90"),
    plot.title.position = "plot"
  )

# Store one results dataframe per RQ3 subset, keyed by subset name
results_list_rq3 <- list()

# Loop through the 18 RQ3 subsets assembled in Section 2.9
for (subset_name in names(rq3_subsets_combined)) {
  current_df_rq3 <- rq3_subsets_combined[[subset_name]]
  
  # Initialize a results dataframe for the current subset
  results_subset_df <- data.frame(
    metric = character(),
    coef_GPT = numeric(),
    se_GPT = numeric(),
    t_value = numeric(),
    p_value = numeric(),
    sigsig = character(),
    stringsAsFactors = FALSE
  )
  
  # Loop through all metrics for the current dataframe
  for (metric in all_metrics) {
    # Same specification as RQ2: controls plus a fixed effect for each
    # unique discussion prompt text
    formula <- as.formula(
      paste0(metric, " ~ GPT_use + discussion_size + pct_international + pct_posts | discussion_topic_content_clean")
    )
    
    # Run regression on the current dataframe
    model <- feols(formula, data = current_df_rq3)
    
    # Extract GPT coefficient info
    coef_summary <- summary(model)$coeftable
    
    # Guard: GPT_use may be dropped by feols in small subsets (no variation
    # or collinearity); skip the metric for this subset in that case
    if ("GPT_use" %in% rownames(coef_summary)) {
      gpt_row <- coef_summary["GPT_use", ]
      
      # Map the p-value to significance stars
      sig <- case_when(
        gpt_row["Pr(>|t|)"] <= 0.001 ~ "***",
        gpt_row["Pr(>|t|)"] <= 0.01  ~ "**",
        gpt_row["Pr(>|t|)"] <= 0.05  ~ "*",
        TRUE                         ~ ""
      )
      
      # Add to the current subset's results dataframe
      results_subset_df <- results_subset_df %>% 
        add_row(
          metric   = metric,
          coef_GPT = gpt_row["Estimate"],
          se_GPT   = gpt_row["Std. Error"],
          t_value  = gpt_row["t value"],
          p_value  = gpt_row["Pr(>|t|)"],
          sigsig   = sig
        )
    }
  }
  
  # Store the complete results for the current subset in the main list
  results_list_rq3[[subset_name]] <- results_subset_df
}

# Combine all RQ3 subset results into one dataframe; .id records subset name
all_results_rq3 <- bind_rows(results_list_rq3, .id = "subset_name")
print(all_results_rq3)
##                   subset_name              metric      coef_GPT      se_GPT
## 1                  Conceptual      avg_similarity  2.531389e-03 0.015382959
## 2                  Conceptual  embedding_variance  3.130661e-03 0.015020269
## 3                  Conceptual     semantic_spread -6.285074e-02 0.031736046
## 4                  Conceptual           MRS_depth  1.269359e-01 0.038307578
## 5                  Conceptual      avg_word_count  4.747419e-02 0.021037129
## 6                  Conceptual        avg_analytic  1.895984e-01 0.033216501
## 7                  Conceptual avg_informativeness  4.217785e-02 0.036645558
## 8                  Conceptual     percent_replied  5.172237e-02 0.011068057
## 9                  Conceptual           avg_depth  4.902173e-03 0.003025158
## 10                 Conceptual           max_depth  3.996510e-02 0.020532454
## 11                 Conceptual           avg_width  5.518516e-03 0.003336772
## 12                 Conceptual           max_width -6.382930e-02 0.016198296
## 13                    Factual      avg_similarity -6.938187e-03 0.045745229
## 14                    Factual  embedding_variance  8.642255e-03 0.042389004
## 15                    Factual     semantic_spread  4.820670e-02 0.102551413
## 16                    Factual           MRS_depth  8.613037e-02 0.045048271
## 17                    Factual      avg_word_count -1.249292e-02 0.028443272
## 18                    Factual        avg_analytic  6.725825e-02 0.036682178
## 19                    Factual avg_informativeness  6.900777e-02 0.056104804
## 20                    Factual     percent_replied  5.193187e-02 0.035104601
## 21                    Factual           avg_depth  9.729454e-03 0.012541486
## 22                    Factual           max_depth  1.211677e-01 0.075169000
## 23                    Factual           avg_width  1.818188e-02 0.021061312
## 24                    Factual           max_width -8.816521e-02 0.051243285
## 25              Metacognitive      avg_similarity  1.104270e-02 0.024122641
## 26              Metacognitive  embedding_variance -1.660771e-03 0.023352717
## 27              Metacognitive     semantic_spread -1.318054e-02 0.038427590
## 28              Metacognitive           MRS_depth  1.981767e-02 0.038634441
## 29              Metacognitive      avg_word_count  5.715306e-03 0.016572230
## 30              Metacognitive        avg_analytic  1.676989e-01 0.039124680
## 31              Metacognitive avg_informativeness -5.237807e-02 0.042983204
## 32              Metacognitive     percent_replied  4.173795e-02 0.015817950
## 33              Metacognitive           avg_depth  3.230395e-03 0.006157868
## 34              Metacognitive           max_depth -7.212578e-04 0.028966813
## 35              Metacognitive           avg_width -5.538945e-04 0.005042570
## 36              Metacognitive           max_width -1.103794e-02 0.022240858
## 37                 Procedural      avg_similarity  6.233362e-02 0.043637672
## 38                 Procedural  embedding_variance -5.849178e-02 0.042018808
## 39                 Procedural     semantic_spread -1.888966e-01 0.119989595
## 40                 Procedural           MRS_depth -1.346716e-01 0.099036356
## 41                 Procedural      avg_word_count  1.147371e-01 0.064970625
## 42                 Procedural        avg_analytic  3.025601e-02 0.047214510
## 43                 Procedural avg_informativeness -3.458931e-02 0.187301880
## 44                 Procedural     percent_replied -2.390325e-02 0.044983010
## 45                 Procedural           avg_depth -2.158465e-02 0.062302428
## 46                 Procedural           max_depth  9.047750e-02 0.090284129
## 47                 Procedural           avg_width -2.547193e-02 0.042948652
## 48                 Procedural           max_width  6.330911e-03 0.048332048
## 49                   Remember      avg_similarity  1.825470e-02 0.173180037
## 50                   Remember  embedding_variance  1.508534e-02 0.146532494
## 51                   Remember     semantic_spread  1.998034e-01 0.145979324
## 52                   Remember           MRS_depth  5.588751e-02 0.036194975
## 53                   Remember      avg_word_count -3.924629e-02 0.030897802
## 54                   Remember        avg_analytic -2.372769e-02 0.068086366
## 55                   Remember avg_informativeness  8.144050e-02 0.122521632
## 56                   Remember     percent_replied  9.129731e-03 0.034034374
## 57                   Remember           avg_depth  1.135166e-02 0.034208173
## 58                   Remember           max_depth  9.809983e-02 0.187675221
## 59                   Remember           avg_width  5.939667e-03 0.043004297
## 60                   Remember           max_width -2.493942e-02 0.053767846
## 61                 Understand      avg_similarity  2.874144e-02 0.021668629
## 62                 Understand  embedding_variance -3.304744e-02 0.021211718
## 63                 Understand     semantic_spread -4.185762e-02 0.081477092
## 64                 Understand           MRS_depth  3.878261e-02 0.039963072
## 65                 Understand      avg_word_count -3.394024e-02 0.018239536
## 66                 Understand        avg_analytic  8.865502e-02 0.022563489
## 67                 Understand avg_informativeness  2.184231e-02 0.053608783
## 68                 Understand     percent_replied  4.901041e-02 0.021980696
## 69                 Understand           avg_depth  1.306712e-02 0.006120530
## 70                 Understand           max_depth  2.244351e-02 0.054651812
## 71                 Understand           avg_width  1.758338e-02 0.010435395
## 72                 Understand           max_width -7.702887e-02 0.035915283
## 73                    Analyze      avg_similarity  1.328999e-02 0.023143457
## 74                    Analyze  embedding_variance -1.583466e-02 0.022549331
## 75                    Analyze     semantic_spread -5.940332e-02 0.058474535
## 76                    Analyze           MRS_depth -8.601259e-02 0.055320202
## 77                    Analyze      avg_word_count -4.248803e-02 0.030733076
## 78                    Analyze        avg_analytic  6.878766e-02 0.020434840
## 79                    Analyze avg_informativeness -4.731850e-02 0.064771729
## 80                    Analyze     percent_replied  7.260251e-03 0.015156126
## 81                    Analyze           avg_depth -4.498438e-04 0.004402931
## 82                    Analyze           max_depth -6.279540e-02 0.033992193
## 83                    Analyze           avg_width  1.572861e-03 0.005011922
## 84                    Analyze           max_width -3.261156e-02 0.023197842
## 85                      Apply      avg_similarity -2.390456e-03 0.043388879
## 86                      Apply  embedding_variance -9.062013e-03 0.040494490
## 87                      Apply     semantic_spread  1.376479e-02 0.119208862
## 88                      Apply           MRS_depth -1.387134e-01 0.117551626
## 89                      Apply      avg_word_count  5.316527e-02 0.040523055
## 90                      Apply        avg_analytic  5.361716e-02 0.050822345
## 91                      Apply avg_informativeness -2.165877e-02 0.165394484
## 92                      Apply     percent_replied -3.102590e-04 0.046450078
## 93                      Apply           avg_depth  2.746244e-03 0.018923065
## 94                      Apply           max_depth  6.879845e-02 0.097680558
## 95                      Apply           avg_width  8.491647e-03 0.024561511
## 96                      Apply           max_width  4.918955e-02 0.071408044
## 97                   Evaluate      avg_similarity -1.417655e-02 0.020318564
## 98                   Evaluate  embedding_variance  2.553848e-02 0.019621020
## 99                   Evaluate     semantic_spread -4.987853e-02 0.033863011
## 100                  Evaluate           MRS_depth  1.651973e-01 0.042645178
## 101                  Evaluate      avg_word_count  6.453134e-02 0.020734194
## 102                  Evaluate        avg_analytic  2.399010e-01 0.042032079
## 103                  Evaluate avg_informativeness  6.810347e-02 0.040811952
## 104                  Evaluate     percent_replied  8.190668e-02 0.013409916
## 105                  Evaluate           avg_depth  4.924346e-03 0.004319835
## 106                  Evaluate           max_depth  6.613770e-02 0.023214619
## 107                  Evaluate           avg_width  5.000044e-03 0.003423585
## 108                  Evaluate           max_width -5.810913e-02 0.017886575
## 109                    Create      avg_similarity  4.223489e-02 0.022668943
## 110                    Create  embedding_variance -3.285425e-02 0.021922399
## 111                    Create     semantic_spread -3.461674e-02 0.051165309
## 112                    Create           MRS_depth  3.933948e-02 0.032430891
## 113                    Create      avg_word_count  3.335922e-02 0.026573544
## 114                    Create        avg_analytic  1.485961e-01 0.042186255
## 115                    Create avg_informativeness -1.024676e-01 0.056970657
## 116                    Create     percent_replied  1.232573e-02 0.020975884
## 117                    Create           avg_depth  5.412446e-04 0.010448701
## 118                    Create           max_depth  1.382629e-02 0.030639811
## 119                    Create           avg_width -6.338086e-03 0.009048055
## 120                    Create           max_width -1.404213e-02 0.028787478
## 121          academic_purpose      avg_similarity  9.074311e-05 0.014619973
## 122          academic_purpose  embedding_variance  5.938514e-03 0.014252079
## 123          academic_purpose     semantic_spread -5.062477e-02 0.025512163
## 124          academic_purpose           MRS_depth  9.412378e-02 0.030718810
## 125          academic_purpose      avg_word_count  2.780500e-02 0.015501997
## 126          academic_purpose        avg_analytic  1.823023e-01 0.026080435
## 127          academic_purpose avg_informativeness  1.695409e-03 0.029892689
## 128          academic_purpose     percent_replied  4.987550e-02 0.010125311
## 129          academic_purpose           avg_depth  1.718419e-03 0.003618323
## 130          academic_purpose           max_depth  2.426550e-02 0.018735667
## 131          academic_purpose           avg_width  5.055129e-04 0.003076711
## 132          academic_purpose           max_width -4.246096e-02 0.013740186
## 133            social_purpose      avg_similarity  7.081662e-02 0.021936975
## 134            social_purpose  embedding_variance -5.938132e-02 0.020835991
## 135            social_purpose     semantic_spread  1.576654e-01 0.069927892
## 136            social_purpose           MRS_depth -1.055531e-01 0.056705482
## 137            social_purpose      avg_word_count -3.903140e-02 0.021389250
## 138            social_purpose        avg_analytic  8.175870e-02 0.025597045
## 139            social_purpose avg_informativeness  4.756487e-04 0.084832015
## 140            social_purpose     percent_replied  4.957204e-02 0.019191794
## 141            social_purpose           avg_depth  3.781138e-03 0.009673251
## 142            social_purpose           max_depth -2.140583e-02 0.038214563
## 143            social_purpose           avg_width  1.099134e-02 0.012969658
## 144            social_purpose           max_width -3.183582e-02 0.035164618
## 145         logistics_purpose      avg_similarity -1.353468e-02 0.020877881
## 146         logistics_purpose  embedding_variance  1.613385e-02 0.020464455
## 147         logistics_purpose     semantic_spread -5.500579e-02 0.066524659
## 148         logistics_purpose           MRS_depth  3.334903e-02 0.035404659
## 149         logistics_purpose      avg_word_count  6.442925e-02 0.034422915
## 150         logistics_purpose        avg_analytic  2.539181e-01 0.069746355
## 151         logistics_purpose avg_informativeness  5.471802e-02 0.056031629
## 152         logistics_purpose     percent_replied  7.199088e-03 0.017824708
## 153         logistics_purpose           avg_depth -1.691704e-02 0.011498671
## 154         logistics_purpose           max_depth  3.903285e-02 0.040202688
## 155         logistics_purpose           avg_width -1.485298e-02 0.010013254
## 156         logistics_purpose           max_width -4.192351e-02 0.045706259
## 157    knowledge_reproduction      avg_similarity  3.057160e-02 0.044581027
## 158    knowledge_reproduction  embedding_variance -3.667423e-02 0.042227807
## 159    knowledge_reproduction     semantic_spread  2.235642e-01 0.187078852
## 160    knowledge_reproduction           MRS_depth  4.840812e-02 0.045857348
## 161    knowledge_reproduction      avg_word_count -1.005250e-01 0.082455522
## 162    knowledge_reproduction        avg_analytic  9.226357e-02 0.044972933
## 163    knowledge_reproduction avg_informativeness -2.588848e-01 0.213085182
## 164    knowledge_reproduction     percent_replied -3.931596e-02 0.036276489
## 165    knowledge_reproduction           avg_depth -1.137908e-02 0.009675375
## 166    knowledge_reproduction           max_depth -2.161743e-01 0.136963724
## 167    knowledge_reproduction           avg_width -1.048937e-03 0.008248571
## 168    knowledge_reproduction           max_width -2.002576e-02 0.066847269
## 169                   open_qa      avg_similarity -1.166194e-02 0.060452359
## 170                   open_qa  embedding_variance  1.983962e-02 0.058136578
## 171                   open_qa     semantic_spread -1.184601e-01 0.200970518
## 172                   open_qa           MRS_depth -2.404695e-02 0.030438963
## 173                   open_qa      avg_word_count -1.066087e-02 0.030154185
## 174                   open_qa        avg_analytic -6.591060e-02 0.069458366
## 175                   open_qa avg_informativeness -9.383234e-02 0.081406388
## 176                   open_qa     percent_replied  1.144998e-01 0.035489111
## 177                   open_qa           avg_depth  8.675154e-03 0.026301278
## 178                   open_qa           max_depth  2.114916e-01 0.152351974
## 179                   open_qa           avg_width -2.766310e-02 0.026956595
## 180                   open_qa           max_width -2.848067e-01 0.085964500
## 181     synthesis_application      avg_similarity -2.874174e-02 0.020526602
## 182     synthesis_application  embedding_variance  3.139672e-02 0.020260867
## 183     synthesis_application     semantic_spread -6.672425e-02 0.029475262
## 184     synthesis_application           MRS_depth  8.448195e-02 0.043286847
## 185     synthesis_application      avg_word_count  4.843257e-02 0.020430706
## 186     synthesis_application        avg_analytic  2.163614e-01 0.036051812
## 187     synthesis_application avg_informativeness  2.937099e-02 0.036765557
## 188     synthesis_application     percent_replied  4.269254e-02 0.012530738
## 189     synthesis_application           avg_depth  4.582789e-03 0.004955156
## 190     synthesis_application           max_depth  5.433943e-02 0.020288780
## 191     synthesis_application           avg_width  2.089875e-03 0.003603361
## 192     synthesis_application           max_width -1.587836e-02 0.016297927
## 193       personal_connection      avg_similarity  1.962604e-03 0.020639482
## 194       personal_connection  embedding_variance  6.929977e-03 0.019962218
## 195       personal_connection     semantic_spread -3.473864e-03 0.036084449
## 196       personal_connection           MRS_depth  9.082662e-02 0.039318138
## 197       personal_connection      avg_word_count  1.634237e-02 0.020161512
## 198       personal_connection        avg_analytic  2.004856e-01 0.039609916
## 199       personal_connection avg_informativeness  4.501023e-03 0.038738282
## 200       personal_connection     percent_replied  5.922759e-02 0.013104658
## 201       personal_connection           avg_depth  5.999611e-03 0.004700297
## 202       personal_connection           max_depth  1.129853e-02 0.025887450
## 203       personal_connection           avg_width  3.658704e-03 0.003592509
## 204       personal_connection           max_width -1.890573e-02 0.018703903
## 205 collaborative_interaction      avg_similarity -1.538006e-02 0.020214672
## 206 collaborative_interaction  embedding_variance  2.005004e-02 0.019890314
## 207 collaborative_interaction     semantic_spread -6.436680e-02 0.030650072
## 208 collaborative_interaction           MRS_depth  1.438568e-01 0.039381455
## 209 collaborative_interaction      avg_word_count  3.734275e-02 0.020682689
## 210 collaborative_interaction        avg_analytic  2.443956e-01 0.034086195
## 211 collaborative_interaction avg_informativeness  4.247767e-02 0.034348494
## 212 collaborative_interaction     percent_replied  6.788847e-02 0.012012129
## 213 collaborative_interaction           avg_depth -1.156034e-03 0.004971934
## 214 collaborative_interaction           max_depth  1.544803e-02 0.024319386
## 215 collaborative_interaction           avg_width -2.433068e-03 0.004182134
## 216 collaborative_interaction           max_width -5.315477e-02 0.019390284
##          t_value      p_value sigsig
## 1    0.164557983 8.693776e-01       
## 2    0.208429067 8.350033e-01       
## 3   -1.980421271 4.836214e-02      *
## 4    3.313597315 1.007612e-03     **
## 5    2.256685985 2.458346e-02      *
## 6    5.707958935 2.274429e-08    ***
## 7    1.150967699 2.504543e-01       
## 8    4.673120556 4.098144e-06    ***
## 9    1.620468721 1.059440e-01       
## 10   1.946435458 5.232390e-02       
## 11   1.653848687 9.896702e-02       
## 12  -3.940494755 9.643217e-05    ***
## 13  -0.151670173 8.799734e-01       
## 14   0.203879640 8.391612e-01       
## 15   0.470073437 6.400653e-01       
## 16   1.911957335 6.082553e-02       
## 17  -0.439222357 6.621325e-01       
## 18   1.833540170 7.185504e-02       
## 19   1.229979688 2.236705e-01       
## 20   1.479346400 1.444590e-01       
## 21   0.775781591 4.410299e-01       
## 22   1.611936872 1.124046e-01       
## 23   0.863283235 3.915376e-01       
## 24  -1.720522225 9.066909e-02       
## 25   0.457773239 6.473561e-01       
## 26  -0.071116804 9.433392e-01       
## 27  -0.342996683 7.317755e-01       
## 28   0.512953476 6.082581e-01       
## 29   0.344872484 7.303658e-01       
## 30   4.286269528 2.263645e-05    ***
## 31  -1.218570697 2.237028e-01       
## 32   2.638644634 8.638556e-03     **
## 33   0.524596347 6.001453e-01       
## 34  -0.024899453 9.801472e-01       
## 35  -0.109843699 9.125867e-01       
## 36  -0.496291146 6.199530e-01       
## 37   1.428435985 1.600709e-01       
## 38  -1.392038179 1.707529e-01       
## 39  -1.574275119 1.224291e-01       
## 40  -1.359819624 1.806615e-01       
## 41   1.765983693 8.418234e-02       
## 42   0.640820107 5.248897e-01       
## 43  -0.184671441 8.543165e-01       
## 44  -0.531383954 5.977661e-01       
## 45  -0.346449576 7.306197e-01       
## 46   1.002141835 3.216336e-01       
## 47  -0.593078773 5.560976e-01       
## 48   0.130987851 8.963688e-01       
## 49   0.105408816 9.172172e-01       
## 50   0.102948765 9.191419e-01       
## 51   1.368709935 1.879306e-01       
## 52   1.544068070 1.399708e-01       
## 53  -1.270196914 2.201888e-01       
## 54  -0.348494061 7.315120e-01       
## 55   0.664703036 5.146669e-01       
## 56   0.268250309 7.915567e-01       
## 57   0.331840482 7.438424e-01       
## 58   0.522710614 6.075502e-01       
## 59   0.138117996 8.916805e-01       
## 60  -0.463835145 6.483259e-01       
## 61   1.326407697 1.869950e-01       
## 62  -1.557980179 1.216323e-01       
## 63  -0.513734808 6.082963e-01       
## 64   0.970461117 3.335916e-01       
## 65  -1.860806277 6.499638e-02       
## 66   3.929136188 1.368237e-04    ***
## 67   0.407438971 6.843456e-01       
## 68   2.229702327 2.745701e-02      *
## 69   2.134965032 3.461127e-02      *
## 70   0.410663624 6.819851e-01       
## 71   1.684974739 9.435628e-02       
## 72  -2.144737828 3.380536e-02      *
## 73   0.574244030 5.664679e-01       
## 74  -0.702222843 4.833819e-01       
## 75  -1.015883539 3.109501e-01       
## 76  -1.554813363 1.216204e-01       
## 77  -1.382485496 1.684121e-01       
## 78   3.366195173 9.187795e-04    ***
## 79  -0.730542472 4.659400e-01       
## 80   0.479030778 6.324562e-01       
## 81  -0.102169165 9.187279e-01       
## 82  -1.847347822 6.621952e-02       
## 83   0.313823962 7.539919e-01       
## 84  -1.405801270 1.613827e-01       
## 85  -0.055093740 9.563250e-01       
## 86  -0.223783856 8.240105e-01       
## 87   0.115467855 9.086246e-01       
## 88  -1.180021511 2.446322e-01       
## 89   1.311975887 1.966542e-01       
## 90   1.054991858 2.974603e-01       
## 91  -0.130952202 8.964384e-01       
## 92  -0.006679408 9.947023e-01       
## 93   0.145126785 8.853050e-01       
## 94   0.704320861 4.851202e-01       
## 95   0.345729821 7.312716e-01       
## 96   0.688851719 4.947045e-01       
## 97  -0.697714396 4.859607e-01       
## 98   1.301587582 1.941751e-01       
## 99  -1.472950300 1.419379e-01       
## 100  3.873761827 1.347699e-04    ***
## 101  3.112314870 2.057101e-03     **
## 102  5.707568709 3.032197e-08    ***
## 103  1.668713942 9.634186e-02       
## 104  6.107919246 3.535603e-09    ***
## 105  1.139938646 2.553296e-01       
## 106  2.848967671 4.726386e-03     **
## 107  1.460470051 1.453322e-01       
## 108 -3.248756673 1.306968e-03     **
## 109  1.863117064 6.363754e-02       
## 110 -1.498661181 1.352437e-01       
## 111 -0.676566686 4.993166e-01       
## 112  1.213025112 2.262837e-01       
## 113  1.255354526 2.105412e-01       
## 114  3.522381270 5.096699e-04    ***
## 115 -1.798603353 7.330724e-02       
## 116  0.587614466 5.573301e-01       
## 117  0.051800176 9.587300e-01       
## 118  0.451252370 6.522052e-01       
## 119 -0.700491559 4.842824e-01       
## 120 -0.487785953 6.261359e-01       
## 121  0.006206790 9.950495e-01       
## 122  0.416677055 6.770426e-01       
## 123 -1.984338631 4.760873e-02      *
## 124  3.064043758 2.267633e-03     **
## 125  1.793640050 7.330277e-02       
## 126  6.990001657 6.428035e-12    ***
## 127  0.056716522 9.547872e-01       
## 128  4.925824528 1.049502e-06    ***
## 129  0.474921372 6.349914e-01       
## 130  1.295150066 1.956961e-01       
## 131  0.164303040 8.695401e-01       
## 132 -3.090275726 2.079072e-03     **
## 133  3.228185263 1.561512e-03     **
## 134 -2.849939887 5.053881e-03     **
## 135  2.254685036 2.575135e-02      *
## 136 -1.861427583 6.484250e-02       
## 137 -1.824813936 7.022367e-02       
## 138  3.194067910 1.743551e-03     **
## 139  0.005606948 9.955345e-01       
## 140  2.582980848 1.085137e-02      *
## 141  0.390885968 6.964932e-01       
## 142 -0.560148600 5.762996e-01       
## 143  0.847465389 3.982247e-01       
## 144 -0.905336659 3.668881e-01       
## 145 -0.648278437 5.178302e-01       
## 146  0.788384075 4.317591e-01       
## 147 -0.826848096 4.096818e-01       
## 148  0.941939014 3.477905e-01       
## 149  1.871696380 6.326306e-02       
## 150  3.640592414 3.777047e-04    ***
## 151  0.976555884 3.304155e-01       
## 152  0.403882545 6.868941e-01       
## 153 -1.471217287 1.433993e-01       
## 154  0.970901455 3.332141e-01       
## 155 -1.483331827 1.401566e-01       
## 156 -0.917237877 3.605411e-01       
## 157  0.685753530 4.963865e-01       
## 158 -0.868485351 3.897371e-01       
## 159  1.195026710 2.383369e-01       
## 160  1.055623978 2.967747e-01       
## 161 -1.219142599 2.291422e-01       
## 162  2.051535512 4.606198e-02      *
## 163 -1.214935506 2.307272e-01       
## 164 -1.083786116 2.842325e-01       
## 165 -1.176086440 2.457448e-01       
## 166 -1.578332418 1.214939e-01       
## 167 -0.127165904 8.993756e-01       
## 168 -0.299574771 7.658809e-01       
## 169 -0.192911325 8.476799e-01       
## 170  0.341258900 7.341009e-01       
## 171 -0.589440228 5.577791e-01       
## 172 -0.790005507 4.326358e-01       
## 173 -0.353545428 7.249188e-01       
## 174 -0.948922351 3.464667e-01       
## 175 -1.152640978 2.536294e-01       
## 176  3.226336378 2.032209e-03     **
## 177  0.329837748 7.426714e-01       
## 178  1.388177717 1.702159e-01       
## 179 -1.026209054 3.089126e-01       
## 180 -3.313073348 1.567320e-03     **
## 181 -1.400218884 1.622510e-01       
## 182  1.549623808 1.220513e-01       
## 183 -2.263737366 2.414473e-02      *
## 184  1.951677022 5.169940e-02       
## 185  2.370577460 1.825085e-02      *
## 186  6.001401164 4.510430e-09    ***
## 187  0.798872386 4.248557e-01       
## 188  3.407025184 7.258191e-04    ***
## 189  0.924852555 3.556204e-01       
## 190  2.678299775 7.715862e-03     **
## 191  0.579979317 5.622669e-01       
## 192 -0.974256123 3.305395e-01       
## 193  0.095089786 9.242921e-01       
## 194  0.347154664 7.286614e-01       
## 195 -0.096270385 9.233550e-01       
## 196  2.310043739 2.140444e-02      *
## 197  0.810572674 4.181029e-01       
## 198  5.061499242 6.409637e-07    ***
## 199  0.116190556 9.075610e-01       
## 200  4.519583007 8.215935e-06    ***
## 201  1.276432144 2.025585e-01       
## 202  0.436448339 6.627518e-01       
## 203  1.018425829 3.091038e-01       
## 204 -1.010790442 3.127402e-01       
## 205 -0.760836383 4.472342e-01       
## 206  1.008030482 3.140915e-01       
## 207 -2.100053927 3.639342e-02      *
## 208  3.652907238 2.962844e-04    ***
## 209  1.805507476 7.179929e-02       
## 210  7.169929080 4.040396e-12    ***
## 211  1.236667533 2.169865e-01       
## 212  5.651660108 3.155234e-08    ***
## 213 -0.232512004 8.162675e-01       
## 214  0.635214777 5.256768e-01       
## 215 -0.581776797 5.610675e-01       
## 216 -2.741309377 6.413278e-03     **
library(forcats)

# Annotate the combined RQ3 results with plotting metadata:
# a framework grouping for faceting, a metric category for color
# coding, and 95% confidence bounds around each coefficient.
rq3_plot_data <- all_results_rq3 %>%
  mutate(
    # Top-level grouping variable used to split the four plot families
    framework = case_when(
      subset_name %in% c("Factual", "Conceptual", "Procedural",
                         "Metacognitive") ~ "Knowledge Type",
      subset_name %in% c("Remember", "Understand", "Analyze", "Evaluate",
                         "Apply", "Create") ~ "Cognitive Process",
      subset_name %in% c("academic_purpose", "social_purpose",
                         "logistics_purpose") ~ "Purpose Type",
      TRUE ~ "Academic Type"  # catches the remaining 5 academic types
    ),
    # Two-way color coding: content metrics vs. everything else
    category = if_else(metric %in% content_metrics,
                       "Content Creation", "Social Engagement"),
    # 95% CI under the normal approximation: estimate +/- 1.96 * SE
    lower_ci = coef_GPT - 1.96 * se_GPT,
    upper_ci = coef_GPT + 1.96 * se_GPT
  )
# Knowledge-type subset, with facet panels and metrics put in a fixed order
knowledge_data <- rq3_plot_data %>%
  filter(framework == "Knowledge Type") %>%
  mutate(
    # Panel order within the facet grid
    subset_name_ordered = fct_relevel(
      subset_name,
      "Factual", "Conceptual", "Procedural", "Metacognitive"
    ),
    # Reversed so the first metric in custom_metric_order sits at the top
    metric = factor(metric, levels = rev(custom_metric_order))
  )

# Forest plot: GPT-use coefficients with 95% CIs, faceted by knowledge type.
# NOTE(review): geom_errorbarh() is superseded by geom_errorbar() with
# xmin/xmax in recent ggplot2; kept for visual parity with earlier renders.
ggplot(knowledge_data, aes(x = coef_GPT, y = metric, color = category)) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.2, linewidth = 0.5) +
  geom_errorbarh(aes(xmin = lower_ci, xmax = upper_ci), height = 0.2, linewidth = 0.09) +
  geom_point(size = 0.06) +
  # Map hjust inside aes(): stars are nudged right of positive coefficients
  # and left of negative ones. The previous ifelse(knowledge_data$coef_GPT...)
  # passed a full-length vector from outside the layer data, which is not
  # subset per panel under facet_wrap and silently relies on row order.
  geom_text(aes(label = sigsig, hjust = ifelse(coef_GPT > 0, -0.4, 1.4)),
            vjust = 0.5, size = 2) +
  facet_wrap(~ subset_name_ordered, scales = "free_y", ncol = 2) + # 4 knowledge types
  scale_color_manual(values = c("Content Creation" = "#0072B2",
                                "Social Engagement" = "#D55E00")) +
  labs(
    title = "RQ3: Effect of GPT Use on Metrics, by Knowledge Type of Discussion Prompt",
    x = "Coefficient of GPT_use (Fixed Effects)",
    y = "Metric",
    color = "Metric Category"
  ) +
  theme_bw(base_size = 6) +
  theme(
    legend.position = "bottom",
    strip.background = element_rect(fill = "gray85"),
    plot.title.position = "plot"
  )

# Cognitive-process subset; facet panels follow Bloom's taxonomy order
cognitive_data <- rq3_plot_data %>%
  filter(framework == "Cognitive Process") %>%
  mutate(
    subset_name_ordered = fct_relevel(
      subset_name,
      "Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"
    ),
    # Reversed so the first metric in custom_metric_order sits at the top
    metric = factor(metric, levels = rev(custom_metric_order))
  )

# Forest plot: GPT-use coefficients with 95% CIs, faceted by cognitive process.
# NOTE(review): geom_errorbarh() is superseded by geom_errorbar() with
# xmin/xmax in recent ggplot2; kept for visual parity with earlier renders.
ggplot(cognitive_data, aes(x = coef_GPT, y = metric, color = category)) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.2, linewidth = 0.5) +
  geom_errorbarh(aes(xmin = lower_ci, xmax = upper_ci), height = 0.2, linewidth = 0.09) +
  geom_point(size = 0.06) +
  # Map hjust inside aes(): stars are nudged right of positive coefficients
  # and left of negative ones. The previous ifelse(cognitive_data$coef_GPT...)
  # passed a full-length vector from outside the layer data, which is not
  # subset per panel under facet_wrap and silently relies on row order.
  geom_text(aes(label = sigsig, hjust = ifelse(coef_GPT > 0, -0.4, 1.4)),
            vjust = 0.5, size = 2) +
  facet_wrap(~ subset_name_ordered, scales = "free_y", ncol = 3) + # 6 cognitive types
  scale_color_manual(values = c("Content Creation" = "#0072B2",
                                "Social Engagement" = "#D55E00")) +
  labs(
    title = "RQ3: Effect of GPT Use on Metrics, by Cognitive Process Type of Discussion Prompt",
    x = "Coefficient of GPT_use (Fixed Effects)",
    y = "Metric",
    color = "Metric Category"
  ) +
  theme_bw(base_size = 6) +
  theme(
    legend.position = "bottom",
    strip.background = element_rect(fill = "gray85"),
    plot.title.position = "plot"
  )

# Purpose-type subset: academic, then social, then logistics panels
purpose_data <- rq3_plot_data %>%
  filter(framework == "Purpose Type") %>%
  mutate(
    subset_name_ordered = fct_relevel(
      subset_name,
      "academic_purpose", "social_purpose", "logistics_purpose"
    ),
    # Reversed so the first metric in custom_metric_order sits at the top
    metric = factor(metric, levels = rev(custom_metric_order))
  )

# Forest plot: GPT-use coefficients with 95% CIs, faceted by purpose type.
# NOTE(review): geom_errorbarh() is superseded by geom_errorbar() with
# xmin/xmax in recent ggplot2; kept for visual parity with earlier renders.
ggplot(purpose_data, aes(x = coef_GPT, y = metric, color = category)) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.2, linewidth = 0.5) +
  geom_errorbarh(aes(xmin = lower_ci, xmax = upper_ci), height = 0.2, linewidth = 0.09) +
  geom_point(size = 0.06) +
  # Map hjust inside aes(): stars are nudged right of positive coefficients
  # and left of negative ones. The previous ifelse(purpose_data$coef_GPT...)
  # passed a full-length vector from outside the layer data, which is not
  # subset per panel under facet_wrap and silently relies on row order.
  geom_text(aes(label = sigsig, hjust = ifelse(coef_GPT > 0, -0.4, 1.4)),
            vjust = 0.5, size = 2) +
  facet_wrap(~ subset_name_ordered, scales = "free_y", ncol = 2) + # 3 purpose types
  scale_color_manual(values = c("Content Creation" = "#0072B2",
                                "Social Engagement" = "#D55E00")) +
  labs(
    title = "RQ3: Effect of GPT Use on Metrics, by Purpose Type of Discussion Prompt",
    x = "Coefficient of GPT_use (Fixed Effects)",
    y = "Metric",
    color = "Metric Category"
  ) +
  theme_bw(base_size = 6) +
  theme(
    legend.position = "bottom",
    strip.background = element_rect(fill = "gray85"),
    plot.title.position = "plot"
  )

# Academic-type subset, with facet panels and metrics put in a fixed order
academic_data <- rq3_plot_data %>%
  filter(framework == "Academic Type") %>%
  mutate(
    subset_name_ordered = fct_relevel(
      subset_name,
      "knowledge_reproduction", "open_qa", "synthesis_application",
      "personal_connection", "collaborative_interaction"
    ),
    # Reversed so the first metric in custom_metric_order sits at the top
    metric = factor(metric, levels = rev(custom_metric_order))
  )

# Forest plot: GPT-use coefficients with 95% CIs, faceted by academic type.
# NOTE(review): geom_errorbarh() is superseded by geom_errorbar() with
# xmin/xmax in recent ggplot2; kept for visual parity with earlier renders.
ggplot(academic_data, aes(x = coef_GPT, y = metric, color = category)) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.2, linewidth = 0.5) +
  geom_errorbarh(aes(xmin = lower_ci, xmax = upper_ci), height = 0.2, linewidth = 0.09) +
  geom_point(size = 0.06) +
  # Map hjust inside aes(): stars are nudged right of positive coefficients
  # and left of negative ones. The previous ifelse(academic_data$coef_GPT...)
  # passed a full-length vector from outside the layer data, which is not
  # subset per panel under facet_wrap and silently relies on row order.
  geom_text(aes(label = sigsig, hjust = ifelse(coef_GPT > 0, -0.4, 1.4)),
            vjust = 0.5, size = 2) +
  facet_wrap(~ subset_name_ordered, scales = "free_y", ncol = 2) + # 5 academic types
  scale_color_manual(values = c("Content Creation" = "#0072B2",
                                "Social Engagement" = "#D55E00")) +
  labs(
    title = "RQ3: Effect of GPT Use on Metrics, by Academic Type of Discussion Prompt",
    x = "Coefficient of GPT_use (Fixed Effects)",
    y = "Metric",
    color = "Metric Category"
  ) +
  theme_bw(base_size = 6) +
  theme(
    legend.position = "bottom",
    strip.background = element_rect(fill = "gray85"),
    plot.title.position = "plot"
  )